From 3a3dc72b5991321d7451a3d370071322f0e829c7 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Sun, 19 May 2024 16:44:00 -0700 Subject: [PATCH] Fix invalid memory access issue in g++ builds (#228) * Try to use shared_ptr to avoid memory issues * Run tests * Update single include * Also added test for empty file SEGFAULT * diD yoU forGet to #incLude --- .github/workflows/cmake-multi-platform.yml | 2 +- include/csv.hpp | 2 +- include/internal/csv_row.cpp | 5 - include/internal/csv_row.hpp | 8 - single_include/csv.hpp | 1831 ++++++++++---------- single_include_test/csv.hpp | 1831 ++++++++++---------- tests/data | 2 +- tests/test_csv_stat.cpp | 17 + 8 files changed, 1838 insertions(+), 1860 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index b00632be..8da957d5 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -4,7 +4,7 @@ name: CMake on multiple platforms on: push: - branches: [ "master", "gcc-error" ] + branches: [ "master", "memory-fix-gcc" ] pull_request: branches: [ "master" ] diff --git a/include/csv.hpp b/include/csv.hpp index b257a2bc..5bcee1b6 100644 --- a/include/csv.hpp +++ b/include/csv.hpp @@ -1,5 +1,5 @@ /* -CSV for C++, version 2.2.1 +CSV for C++, version 2.2.2 https://github.com/vincentlaucsb/csv-parser MIT License diff --git a/include/internal/csv_row.cpp b/include/internal/csv_row.cpp index a8ac0417..128b1d94 100644 --- a/include/internal/csv_row.cpp +++ b/include/internal/csv_row.cpp @@ -204,12 +204,7 @@ namespace csv { } CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - #ifdef _MSC_BUILD return this->field; - #else - return this->field.get(); - #endif } CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index 519cd239..5f46a6dc 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -337,15 +337,7 @@ namespace csv { #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVField; using difference_type = int; - - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - // but using shared_ptr as pointer type won't compile in g++ -#ifdef _MSC_BUILD using pointer = std::shared_ptr; -#else - using pointer = CSVField * ; -#endif - using reference = CSVField & ; using iterator_category = std::random_access_iterator_tag; #endif diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 4e768eea..037c9225 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -1,6 +1,6 @@ #pragma once /* -CSV for C++, version 2.2.1 +CSV for C++, version 2.2.2 https://github.com/vincentlaucsb/csv-parser MIT License @@ -5723,15 +5723,7 @@ namespace csv { #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVField; using difference_type = int; - - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - // but using shared_ptr as pointer type won't compile in g++ -#ifdef _MSC_BUILD using pointer = std::shared_ptr; -#else - using pointer = CSVField * ; -#endif - using reference = CSVField & ; using iterator_category = std::random_access_iterator_tag; #endif @@ -6927,347 +6919,470 @@ namespace csv { ///@} } -/** @file - * Implements JSON serialization abilities - */ - namespace csv { - /* - The implementations for json_extra_space() and json_escape_string() - were modified from source code for JSON for Modern C++. + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); - The respective license is below: + return end - start; + } - The code is licensed under the [MIT - License](http://opensource.org/licenses/MIT): - - Copyright © 2013-2015 Niels Lohmann. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation files - (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - */ + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); + } - namespace internals { - /*! - @brief calculates the extra space to escape a JSON string + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; - @param[in] s the string to escape - @return the number of characters required to escape string @a s + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); - @complexity Linear in the length of string @a s. - */ - static std::size_t json_extra_space(csv::string_view& s) noexcept - { - std::size_t result = 0; + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + return std::string(mmap.begin(), mmap.end()); + } - for (const auto& c : s) - { - switch (c) - { - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - result += 1; - break; - } +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); + } + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); + } + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); + } - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // from c (1 byte) to \uxxxx (6 bytes) - result += 5; - } - break; - } - } - } + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER; - return result; + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); } - CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept - { - const auto space = json_extra_space(s); - if (space == 0) - { - return std::string(s); - } + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; - // create a result string of necessary size - size_t result_size = s.size() + space; - std::string result(result_size, '\\'); - std::size_t pos = 0; + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; - for (const auto& c : s) - { - switch (c) - { - // quotation mark (0x22) - case '"': - { - result[pos + 1] = '"'; - pos += 2; - break; - } + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; - // reverse solidus (0x5c) - case '\\': - { - // nothing to change - pos += 2; - break; - } + field_length = data_pos - (field_start + current_row_start()); + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; + } - // backspace (0x08) - case '\b': - { - result[pos + 1] = 'b'; - pos += 2; - break; - } + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } - // formfeed (0x0c) - case '\f': - { - result[pos + 1] = 'f'; - pos += 2; - break; - } + current_row.row_length++; + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; + } - // newline (0x0a) - case '\n': - { - result[pos + 1] = 'n'; - pos += 2; - break; - } + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); - // carriage return (0x0d) - case '\r': - { - result[pos + 1] = 'r'; - pos += 2; + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; break; - } + case ParseFlags::NEWLINE: + this->data_pos++; - // horizontal tab (0x09) - case '\t': - { - result[pos + 1] = 't'; - pos += 2; - break; - } + // Catches CRLF (or LFLF) + if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; + // End of record -> Write record + this->push_field(); + this->push_row(); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // print character c as \uxxxx - snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); - pos += 6; - // overwrite trailing null character - result[pos] = '\\'; + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; + + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; + + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } } - else - { - // all other characters are added as-is - result[pos++] = c; + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; + + break; + + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + break; } - } } - return result; + return this->current_row_start(); } - } - /** Convert a CSV row to a JSON object, i.e. - * `{"col1":"value1","col2":"value2"}` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for original columns. - */ - CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) { - col_names = this->data ? this->get_col_names() : std::vector({}); + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); } - const size_t _n_cols = col_names.size(); - std::string ret = "{"; - - for (size_t i = 0; i < _n_cols; i++) { - auto& col = col_names[i]; - auto field = this->operator[](col); + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } - // TODO: Possible performance enhancements by caching escaped column names - ret += '"' + internals::json_escape_string(col) + "\":"; + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + this->unicode_bom_scan = true; + } } +#ifdef _MSC_VER +#pragma endregion +#endif - ret += '}'; - return ret; - } +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); - /** Convert a CSV row to a JSON array, i.e. - * `["value1","value2",...]` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for all columns. - */ - CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) - col_names = this->data ? this->get_col_names() : std::vector({}); + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; - const size_t _n_cols = col_names.size(); - std::string ret = "["; + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); - for (size_t i = 0; i < _n_cols; i++) { - auto field = this->operator[](col_names[i]); + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; - } + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } - ret += ']'; - return ret; + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif } } -/** @file - * @brief Defines functionality needed for basic CSV parsing - */ - namespace csv { namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); - - return ret.str(); + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - return CSVRow(std::move(rows[format.get_header()])); + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } } - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; + return CSV_NOT_FOUND; + } - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + } +} +/** @file + * Defines an object used to store CSV format settings + */ - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; +#include +#include - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } - } - double final_score = 0; - size_t header_row = 0; +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + + this->header = row; + this->col_names = {}; + return *this; + } + + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); + + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); + } + + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; + + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; + + if (i + 1 < intersection.size()) + err_msg += ", "; + } + + throw std::runtime_error(err_msg + '.'); + } + } +} +/** @file + * @brief Defines functionality needed for basic CSV parsing + */ + + +namespace csv { + namespace internals { + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); + + return ret.str(); + } + + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; + + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + return CSVRow(std::move(rows[format.get_header()])); + } + + CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; + + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; + + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; + + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; + + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; + } + } + } + + double final_score = 0; + size_t header_row = 0; + + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { final_score = score; header_row = row_when[row_size]; } @@ -7498,698 +7613,583 @@ namespace csv { } } +/** @file + * Defines an input iterator for csv::CSVReader + */ -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; - } - - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } - - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; - return CSV_NOT_FOUND; - } +namespace csv { + /** Return an iterator to the first row in the reader */ + CSV_INLINE CSVReader::iterator CSVReader::begin() { + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); + // Still empty => return end iterator + if (this->records->empty()) return this->end(); } + CSVReader::iterator ret(this, this->records->pop_front()); + return ret; } -} -#include -#include - -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { + return CSVReader::iterator(); } - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); + ///////////////////////// + // CSVReader::iterator // + ///////////////////////// - return parse(in, format); + CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : + daddy(_daddy) { + row = std::move(_row); } - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects + /** Advance the iterator by one row. If this CSVReader has an + * associated file, then the iterator will lazily pull more data from + * that file until the end of file is reached. * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma + * @note This iterator does **not** block the thread responsible for parsing CSV. * */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); + CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } + + return *this; } - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); + /** Post-increment iterator */ + CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { + auto temp = *this; + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } + + return temp; } +} - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } - - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); - - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; - - return info; - } -} /** @file - * Defines an input iterator for csv::CSVReader + * Defines the data type used for storing information about a CSV row */ +#include +#include namespace csv { - /** Return an iterator to the first row in the reader */ - CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); - - // Still empty => return end iterator - if (this->records->empty()) return this->end(); + namespace internals { + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; } - CSVReader::iterator ret(this, this->records->pop_front()); - return ret; + CSV_INLINE void CSVFieldList::allocate() { + RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; + buffers.push_back(buffer); + _current_buffer_size = 0; + _back = &(buffers.back()[0]); + } } - /** A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** Return a CSVField object corrsponding to the nth value in the row. + * + * @note This method performs bounds checking, and will throw an + * `std::runtime_error` if n is invalid. + * + * @complexity + * Constant, by calling csv::CSVRow::get_csv::string_view() + * */ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { - return CSVReader::iterator(); - } - - ///////////////////////// - // CSVReader::iterator // - ///////////////////////// - - CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : - daddy(_daddy) { - row = std::move(_row); + CSV_INLINE CSVField CSVRow::operator[](size_t n) const { + return CSVField(this->get_field(n)); } - /** Advance the iterator by one row. If this CSVReader has an - * associated file, then the iterator will lazily pull more data from - * that file until the end of file is reached. + /** Retrieve a value by its associated column name. If the column + * specified can't be round, a runtime error is thrown. * - * @note This iterator does **not** block the thread responsible for parsing CSV. + * @complexity + * Constant. This calls the other CSVRow::operator[]() after + * converting column names into indices using a hash table. * + * @param[in] col_name The column to look for */ - CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() + CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { + auto & col_names = this->data->col_names; + auto col_pos = col_names->index_of(col_name); + if (col_pos > -1) { + return this->operator[](col_pos); } - return *this; + throw std::runtime_error("Can't find a column named " + col_name); } - /** Post-increment iterator */ - CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { - auto temp = *this; - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } + CSV_INLINE CSVRow::operator std::vector() const { + std::vector ret; + for (size_t i = 0; i < size(); i++) + ret.push_back(std::string(this->get_field(i))); - return temp; + return ret; } -} - - -namespace csv { - namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - return end - start; - } + CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const + { + using internals::ParseFlags; - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { + bool prev_ch_quote = false; + for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { + if (prev_ch_quote) { + prev_ch_quote = false; + continue; + } + else { + prev_ch_quote = true; + } + } - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); + value += field_str[i]; + } } - return std::string(mmap.begin(), mmap.end()); + return csv::string_view(value); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + return field_str.substr(0, field.length); + } - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); + + int value_ = 0; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER; + size_t digits = (end - start); + size_t base16_exponent = digits - 1; - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + if (digits == 0) return false; - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + parsedValue = value_; + return true; + } - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; - - field_length = data_pos - (field_start + current_row_start()); - - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; - } - - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; - - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } - - current_row.row_length++; - - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; - } - - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); - - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; - - case ParseFlags::NEWLINE: - this->data_pos++; - - // Catches CRLF (or LFLF) - if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; - - // End of record -> Write record - this->push_field(); - this->push_row(); - - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; - - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; +#ifdef _MSC_VER +#pragma region CSVRow Iterator +#endif + /** Return an iterator pointing to the first field. */ + CSV_INLINE CSVRow::iterator CSVRow::begin() const { + return CSVRow::iterator(this, 0); + } - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; + /** Return an iterator pointing to just after the end of the CSVRow. + * + * @warning Attempting to dereference the end iterator results + * in dereferencing a null pointer. + */ + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { + return CSVRow::iterator(this, (int)this->size()); + } - break; + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { + return std::reverse_iterator(this->end()); + } - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { + return std::reverse_iterator(this->begin()); + } - // Case: Unescaped quote - this->field_length++; - data_pos++; + CSV_INLINE HEDLEY_NON_NULL(2) + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) + : daddy(_reader), i(_i) { + if (_i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](_i)); + else + this->field = nullptr; + } - break; - } - } + CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { + return *(this->field.get()); + } - return this->current_row_start(); - } + CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { + return this->field; + } - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { + // Pre-increment operator + this->i++; + if (this->i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](i)); + else // Reached the end of row + this->field = nullptr; + return *this; + } - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { + // Post-increment operator + auto temp = *this; + this->operator++(); + return temp; + } - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { + // Pre-decrement operator + this->i--; + this->field = std::make_shared( + this->daddy->operator[](this->i)); + return *this; + } - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { + // Post-decrement operator + auto temp = *this; + this->operator--(); + return temp; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator(this->daddy, i + (int)n); + } - this->unicode_bom_scan = true; - } - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator::operator+(-n); + } #ifdef _MSC_VER -#pragma endregion +#pragma endregion CSVRow Iterator #endif +} -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); +/** @file + * Implements JSON serialization abilities + */ - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); +namespace csv { + /* + The implementations for json_extra_space() and json_escape_string() + were modified from source code for JSON for Modern C++. - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + The respective license is below: - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); + The code is licensed under the [MIT + License](http://opensource.org/licenses/MIT): + + Copyright © 2013-2015 Niels Lohmann. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + */ - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); - } + namespace internals { + /*! + @brief calculates the extra space to escape a JSON string - this->mmap_pos -= (length - remainder); - } -#ifdef _MSC_VER -#pragma endregion -#endif - } -} + @param[in] s the string to escape + @return the number of characters required to escape string @a s + + @complexity Linear in the length of string @a s. + */ + static std::size_t json_extra_space(csv::string_view& s) noexcept + { + std::size_t result = 0; -/** @file - * Defines the data type used for storing information about a CSV row - */ -#include -#include + for (const auto& c : s) + { + switch (c) + { + case '"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + { + // from c (1 byte) to \x (2 bytes) + result += 1; + break; + } -namespace csv { - namespace internals { - CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { - const size_t page_no = n / _single_buffer_capacity; - const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; - } - CSV_INLINE void CSVFieldList::allocate() { - RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; - buffers.push_back(buffer); - _current_buffer_size = 0; - _back = &(buffers.back()[0]); - } - } + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // from c (1 byte) to \uxxxx (6 bytes) + result += 5; + } + break; + } + } + } - /** Return a CSVField object corrsponding to the nth value in the row. - * - * @note This method performs bounds checking, and will throw an - * `std::runtime_error` if n is invalid. - * - * @complexity - * Constant, by calling csv::CSVRow::get_csv::string_view() - * - */ - CSV_INLINE CSVField CSVRow::operator[](size_t n) const { - return CSVField(this->get_field(n)); - } - /** Retrieve a value by its associated column name. If the column - * specified can't be round, a runtime error is thrown. - * - * @complexity - * Constant. This calls the other CSVRow::operator[]() after - * converting column names into indices using a hash table. - * - * @param[in] col_name The column to look for - */ - CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { - auto & col_names = this->data->col_names; - auto col_pos = col_names->index_of(col_name); - if (col_pos > -1) { - return this->operator[](col_pos); + return result; } - throw std::runtime_error("Can't find a column named " + col_name); - } + CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept + { + const auto space = json_extra_space(s); + if (space == 0) + { + return std::string(s); + } - CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; - for (size_t i = 0; i < size(); i++) - ret.push_back(std::string(this->get_field(i))); + // create a result string of necessary size + size_t result_size = s.size() + space; + std::string result(result_size, '\\'); + std::size_t pos = 0; - return ret; - } + for (const auto& c : s) + { + switch (c) + { + // quotation mark (0x22) + case '"': + { + result[pos + 1] = '"'; + pos += 2; + break; + } - CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const - { - using internals::ParseFlags; - if (index >= this->size()) - throw std::runtime_error("Index out of bounds."); + // reverse solidus (0x5c) + case '\\': + { + // nothing to change + pos += 2; + break; + } - const size_t field_index = this->fields_start + index; - auto& field = this->data->fields[field_index]; - auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - if (field.has_double_quote) { - auto& value = this->data->double_quote_fields[field_index]; - if (value.empty()) { - bool prev_ch_quote = false; - for (size_t i = 0; i < field.length; i++) { - if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { - if (prev_ch_quote) { - prev_ch_quote = false; - continue; - } - else { - prev_ch_quote = true; - } - } + // backspace (0x08) + case '\b': + { + result[pos + 1] = 'b'; + pos += 2; + break; + } - value += field_str[i]; + + // formfeed (0x0c) + case '\f': + { + result[pos + 1] = 'f'; + pos += 2; + break; } - } - return csv::string_view(value); - } - return field_str.substr(0, field.length); - } + // newline (0x0a) + case '\n': + { + result[pos + 1] = 'n'; + pos += 2; + break; + } - CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { - size_t start = 0, end = 0; - // Trim out whitespace chars - for (; start < this->sv.size() && this->sv[start] == ' '; start++); - for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - - int value_ = 0; + // carriage return (0x0d) + case '\r': + { + result[pos + 1] = 'r'; + pos += 2; + break; + } - size_t digits = (end - start); - size_t base16_exponent = digits - 1; - if (digits == 0) return false; + // horizontal tab (0x09) + case '\t': + { + result[pos + 1] = 't'; + pos += 2; + break; + } - for (const auto& ch : this->sv.substr(start, digits)) { - int digit = 0; - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit = static_cast(ch - '0'); - break; - case 'a': - case 'A': - digit = 10; - break; - case 'b': - case 'B': - digit = 11; - break; - case 'c': - case 'C': - digit = 12; - break; - case 'd': - case 'D': - digit = 13; - break; - case 'e': - case 'E': - digit = 14; - break; - case 'f': - case 'F': - digit = 15; - break; - default: - return false; + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // print character c as \uxxxx + snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); + pos += 6; + // overwrite trailing null character + result[pos] = '\\'; + } + else + { + // all other characters are added as-is + result[pos++] = c; + } + break; + } + } } - value_ += digit * (int)pow(16, (double)base16_exponent); - base16_exponent--; + return result; } - - parsedValue = value_; - return true; - } - -#ifdef _MSC_VER -#pragma region CSVRow Iterator -#endif - /** Return an iterator pointing to the first field. */ - CSV_INLINE CSVRow::iterator CSVRow::begin() const { - return CSVRow::iterator(this, 0); } - /** Return an iterator pointing to just after the end of the CSVRow. + /** Convert a CSV row to a JSON object, i.e. + * `{"col1":"value1","col2":"value2"}` * - * @warning Attempting to dereference the end iterator results - * in dereferencing a null pointer. + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for original columns. */ - CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { - return CSVRow::iterator(this, (int)this->size()); - } + CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) { + col_names = this->data ? this->get_col_names() : std::vector({}); + } - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { - return std::reverse_iterator(this->end()); - } + const size_t _n_cols = col_names.size(); + std::string ret = "{"; + + for (size_t i = 0; i < _n_cols; i++) { + auto& col = col_names[i]; + auto field = this->operator[](col); - CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { - return std::reverse_iterator(this->begin()); - } + // TODO: Possible performance enhancements by caching escaped column names + ret += '"' + internals::json_escape_string(col) + "\":"; - CSV_INLINE HEDLEY_NON_NULL(2) - CSVRow::iterator::iterator(const CSVRow* _reader, int _i) - : daddy(_reader), i(_i) { - if (_i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](_i)); - else - this->field = nullptr; - } + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; - CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { - return *(this->field.get()); - } + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; + } - CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - #ifdef _MSC_BUILD - return this->field; - #else - return this->field.get(); - #endif + ret += '}'; + return ret; } - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { - // Pre-increment operator - this->i++; - if (this->i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](i)); - else // Reached the end of row - this->field = nullptr; - return *this; - } + /** Convert a CSV row to a JSON array, i.e. + * `["value1","value2",...]` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for all columns. + */ + CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) + col_names = this->data ? this->get_col_names() : std::vector({}); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { - // Post-increment operator - auto temp = *this; - this->operator++(); - return temp; - } + const size_t _n_cols = col_names.size(); + std::string ret = "["; - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { - // Pre-decrement operator - this->i--; - this->field = std::make_shared( - this->daddy->operator[](this->i)); - return *this; - } + for (size_t i = 0; i < _n_cols; i++) { + auto field = this->operator[](col_names[i]); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { - // Post-decrement operator - auto temp = *this; - this->operator--(); - return temp; - } - - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator(this->daddy, i + (int)n); - } + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator::operator+(-n); + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; + } + + ret += ']'; + return ret; } -#ifdef _MSC_VER -#pragma endregion CSVRow Iterator -#endif } /** @file @@ -8458,95 +8458,82 @@ namespace csv { return csv_dtypes; } } -/** @file - * Defines an object used to store CSV format settings - */ - -#include -#include +#include +#include namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; + return parse(in, format); } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * + */ + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); } - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); } - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise + * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file + */ + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); } - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); - if (i + 1 < intersection.size()) - err_msg += ", "; - } + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; - throw std::runtime_error(err_msg + '.'); - } + return info; } } diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 4e768eea..037c9225 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -1,6 +1,6 @@ #pragma once /* -CSV for C++, version 2.2.1 +CSV for C++, version 2.2.2 https://github.com/vincentlaucsb/csv-parser MIT License @@ -5723,15 +5723,7 @@ namespace csv { #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVField; using difference_type = int; - - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - // but using shared_ptr as pointer type won't compile in g++ -#ifdef _MSC_BUILD using pointer = std::shared_ptr; -#else - using pointer = CSVField * ; -#endif - using reference = CSVField & ; using iterator_category = std::random_access_iterator_tag; #endif @@ -6927,347 +6919,470 @@ namespace csv { ///@} } -/** @file - * Implements JSON serialization abilities - */ - namespace csv { - /* - The implementations for json_extra_space() and json_escape_string() - were modified from source code for JSON for Modern C++. + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); - The respective license is below: + return end - start; + } - The code is licensed under the [MIT - License](http://opensource.org/licenses/MIT): - - Copyright © 2013-2015 Niels Lohmann. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation files - (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - */ + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); + } - namespace internals { - /*! - @brief calculates the extra space to escape a JSON string + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; - @param[in] s the string to escape - @return the number of characters required to escape string @a s + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); - @complexity Linear in the length of string @a s. - */ - static std::size_t json_extra_space(csv::string_view& s) noexcept - { - std::size_t result = 0; + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + return std::string(mmap.begin(), mmap.end()); + } - for (const auto& c : s) - { - switch (c) - { - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - result += 1; - break; - } +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); + } + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); + } + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); + } - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // from c (1 byte) to \uxxxx (6 bytes) - result += 5; - } - break; - } - } - } + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER; - return result; + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); } - CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept - { - const auto space = json_extra_space(s); - if (space == 0) - { - return std::string(s); - } + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; - // create a result string of necessary size - size_t result_size = s.size() + space; - std::string result(result_size, '\\'); - std::size_t pos = 0; + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; - for (const auto& c : s) - { - switch (c) - { - // quotation mark (0x22) - case '"': - { - result[pos + 1] = '"'; - pos += 2; - break; - } + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; - // reverse solidus (0x5c) - case '\\': - { - // nothing to change - pos += 2; - break; - } + field_length = data_pos - (field_start + current_row_start()); + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; + } - // backspace (0x08) - case '\b': - { - result[pos + 1] = 'b'; - pos += 2; - break; - } + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } - // formfeed (0x0c) - case '\f': - { - result[pos + 1] = 'f'; - pos += 2; - break; - } + current_row.row_length++; + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; + } - // newline (0x0a) - case '\n': - { - result[pos + 1] = 'n'; - pos += 2; - break; - } + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); - // carriage return (0x0d) - case '\r': - { - result[pos + 1] = 'r'; - pos += 2; + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; break; - } + case ParseFlags::NEWLINE: + this->data_pos++; - // horizontal tab (0x09) - case '\t': - { - result[pos + 1] = 't'; - pos += 2; - break; - } + // Catches CRLF (or LFLF) + if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; + // End of record -> Write record + this->push_field(); + this->push_row(); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // print character c as \uxxxx - snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); - pos += 6; - // overwrite trailing null character - result[pos] = '\\'; + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; + + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; + + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } } - else - { - // all other characters are added as-is - result[pos++] = c; + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; + + break; + + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + break; } - } } - return result; + return this->current_row_start(); } - } - /** Convert a CSV row to a JSON object, i.e. - * `{"col1":"value1","col2":"value2"}` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for original columns. - */ - CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) { - col_names = this->data ? this->get_col_names() : std::vector({}); + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); } - const size_t _n_cols = col_names.size(); - std::string ret = "{"; - - for (size_t i = 0; i < _n_cols; i++) { - auto& col = col_names[i]; - auto field = this->operator[](col); + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } - // TODO: Possible performance enhancements by caching escaped column names - ret += '"' + internals::json_escape_string(col) + "\":"; + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + this->unicode_bom_scan = true; + } } +#ifdef _MSC_VER +#pragma endregion +#endif - ret += '}'; - return ret; - } +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); - /** Convert a CSV row to a JSON array, i.e. - * `["value1","value2",...]` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for all columns. - */ - CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) - col_names = this->data ? this->get_col_names() : std::vector({}); + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; - const size_t _n_cols = col_names.size(); - std::string ret = "["; + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); - for (size_t i = 0; i < _n_cols; i++) { - auto field = this->operator[](col_names[i]); + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; - } + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } - ret += ']'; - return ret; + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif } } -/** @file - * @brief Defines functionality needed for basic CSV parsing - */ - namespace csv { namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); - - return ret.str(); + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - return CSVRow(std::move(rows[format.get_header()])); + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } } - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; + return CSV_NOT_FOUND; + } - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + } +} +/** @file + * Defines an object used to store CSV format settings + */ - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; +#include +#include - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } - } - double final_score = 0; - size_t header_row = 0; +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + + this->header = row; + this->col_names = {}; + return *this; + } + + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); + + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); + } + + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; + + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; + + if (i + 1 < intersection.size()) + err_msg += ", "; + } + + throw std::runtime_error(err_msg + '.'); + } + } +} +/** @file + * @brief Defines functionality needed for basic CSV parsing + */ + + +namespace csv { + namespace internals { + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); + + return ret.str(); + } + + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; + + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + return CSVRow(std::move(rows[format.get_header()])); + } + + CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; + + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; + + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; + + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; + + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; + } + } + } + + double final_score = 0; + size_t header_row = 0; + + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { final_score = score; header_row = row_when[row_size]; } @@ -7498,698 +7613,583 @@ namespace csv { } } +/** @file + * Defines an input iterator for csv::CSVReader + */ -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; - } - - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } - - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; - return CSV_NOT_FOUND; - } +namespace csv { + /** Return an iterator to the first row in the reader */ + CSV_INLINE CSVReader::iterator CSVReader::begin() { + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); + // Still empty => return end iterator + if (this->records->empty()) return this->end(); } + CSVReader::iterator ret(this, this->records->pop_front()); + return ret; } -} -#include -#include - -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { + return CSVReader::iterator(); } - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); + ///////////////////////// + // CSVReader::iterator // + ///////////////////////// - return parse(in, format); + CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : + daddy(_daddy) { + row = std::move(_row); } - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects + /** Advance the iterator by one row. If this CSVReader has an + * associated file, then the iterator will lazily pull more data from + * that file until the end of file is reached. * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma + * @note This iterator does **not** block the thread responsible for parsing CSV. * */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); + CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } + + return *this; } - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); + /** Post-increment iterator */ + CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { + auto temp = *this; + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } + + return temp; } +} - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } - - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); - - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; - - return info; - } -} /** @file - * Defines an input iterator for csv::CSVReader + * Defines the data type used for storing information about a CSV row */ +#include +#include namespace csv { - /** Return an iterator to the first row in the reader */ - CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); - - // Still empty => return end iterator - if (this->records->empty()) return this->end(); + namespace internals { + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; } - CSVReader::iterator ret(this, this->records->pop_front()); - return ret; + CSV_INLINE void CSVFieldList::allocate() { + RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; + buffers.push_back(buffer); + _current_buffer_size = 0; + _back = &(buffers.back()[0]); + } } - /** A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** Return a CSVField object corrsponding to the nth value in the row. + * + * @note This method performs bounds checking, and will throw an + * `std::runtime_error` if n is invalid. + * + * @complexity + * Constant, by calling csv::CSVRow::get_csv::string_view() + * */ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { - return CSVReader::iterator(); - } - - ///////////////////////// - // CSVReader::iterator // - ///////////////////////// - - CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : - daddy(_daddy) { - row = std::move(_row); + CSV_INLINE CSVField CSVRow::operator[](size_t n) const { + return CSVField(this->get_field(n)); } - /** Advance the iterator by one row. If this CSVReader has an - * associated file, then the iterator will lazily pull more data from - * that file until the end of file is reached. + /** Retrieve a value by its associated column name. If the column + * specified can't be round, a runtime error is thrown. * - * @note This iterator does **not** block the thread responsible for parsing CSV. + * @complexity + * Constant. This calls the other CSVRow::operator[]() after + * converting column names into indices using a hash table. * + * @param[in] col_name The column to look for */ - CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() + CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { + auto & col_names = this->data->col_names; + auto col_pos = col_names->index_of(col_name); + if (col_pos > -1) { + return this->operator[](col_pos); } - return *this; + throw std::runtime_error("Can't find a column named " + col_name); } - /** Post-increment iterator */ - CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { - auto temp = *this; - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } + CSV_INLINE CSVRow::operator std::vector() const { + std::vector ret; + for (size_t i = 0; i < size(); i++) + ret.push_back(std::string(this->get_field(i))); - return temp; + return ret; } -} - - -namespace csv { - namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - return end - start; - } + CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const + { + using internals::ParseFlags; - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { + bool prev_ch_quote = false; + for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { + if (prev_ch_quote) { + prev_ch_quote = false; + continue; + } + else { + prev_ch_quote = true; + } + } - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); + value += field_str[i]; + } } - return std::string(mmap.begin(), mmap.end()); + return csv::string_view(value); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + return field_str.substr(0, field.length); + } - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); + + int value_ = 0; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER; + size_t digits = (end - start); + size_t base16_exponent = digits - 1; - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + if (digits == 0) return false; - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + parsedValue = value_; + return true; + } - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; - - field_length = data_pos - (field_start + current_row_start()); - - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; - } - - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; - - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } - - current_row.row_length++; - - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; - } - - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); - - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; - - case ParseFlags::NEWLINE: - this->data_pos++; - - // Catches CRLF (or LFLF) - if (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; - - // End of record -> Write record - this->push_field(); - this->push_row(); - - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; - - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; +#ifdef _MSC_VER +#pragma region CSVRow Iterator +#endif + /** Return an iterator pointing to the first field. */ + CSV_INLINE CSVRow::iterator CSVRow::begin() const { + return CSVRow::iterator(this, 0); + } - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; + /** Return an iterator pointing to just after the end of the CSVRow. + * + * @warning Attempting to dereference the end iterator results + * in dereferencing a null pointer. + */ + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { + return CSVRow::iterator(this, (int)this->size()); + } - break; + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { + return std::reverse_iterator(this->end()); + } - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { + return std::reverse_iterator(this->begin()); + } - // Case: Unescaped quote - this->field_length++; - data_pos++; + CSV_INLINE HEDLEY_NON_NULL(2) + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) + : daddy(_reader), i(_i) { + if (_i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](_i)); + else + this->field = nullptr; + } - break; - } - } + CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { + return *(this->field.get()); + } - return this->current_row_start(); - } + CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { + return this->field; + } - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { + // Pre-increment operator + this->i++; + if (this->i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](i)); + else // Reached the end of row + this->field = nullptr; + return *this; + } - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { + // Post-increment operator + auto temp = *this; + this->operator++(); + return temp; + } - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { + // Pre-decrement operator + this->i--; + this->field = std::make_shared( + this->daddy->operator[](this->i)); + return *this; + } - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { + // Post-decrement operator + auto temp = *this; + this->operator--(); + return temp; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator(this->daddy, i + (int)n); + } - this->unicode_bom_scan = true; - } - } + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator::operator+(-n); + } #ifdef _MSC_VER -#pragma endregion +#pragma endregion CSVRow Iterator #endif +} -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); +/** @file + * Implements JSON serialization abilities + */ - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); +namespace csv { + /* + The implementations for json_extra_space() and json_escape_string() + were modified from source code for JSON for Modern C++. - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + The respective license is below: - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); + The code is licensed under the [MIT + License](http://opensource.org/licenses/MIT): + + Copyright © 2013-2015 Niels Lohmann. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + */ - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); - } + namespace internals { + /*! + @brief calculates the extra space to escape a JSON string - this->mmap_pos -= (length - remainder); - } -#ifdef _MSC_VER -#pragma endregion -#endif - } -} + @param[in] s the string to escape + @return the number of characters required to escape string @a s + + @complexity Linear in the length of string @a s. + */ + static std::size_t json_extra_space(csv::string_view& s) noexcept + { + std::size_t result = 0; -/** @file - * Defines the data type used for storing information about a CSV row - */ -#include -#include + for (const auto& c : s) + { + switch (c) + { + case '"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + { + // from c (1 byte) to \x (2 bytes) + result += 1; + break; + } -namespace csv { - namespace internals { - CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { - const size_t page_no = n / _single_buffer_capacity; - const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; - } - CSV_INLINE void CSVFieldList::allocate() { - RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; - buffers.push_back(buffer); - _current_buffer_size = 0; - _back = &(buffers.back()[0]); - } - } + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // from c (1 byte) to \uxxxx (6 bytes) + result += 5; + } + break; + } + } + } - /** Return a CSVField object corrsponding to the nth value in the row. - * - * @note This method performs bounds checking, and will throw an - * `std::runtime_error` if n is invalid. - * - * @complexity - * Constant, by calling csv::CSVRow::get_csv::string_view() - * - */ - CSV_INLINE CSVField CSVRow::operator[](size_t n) const { - return CSVField(this->get_field(n)); - } - /** Retrieve a value by its associated column name. If the column - * specified can't be round, a runtime error is thrown. - * - * @complexity - * Constant. This calls the other CSVRow::operator[]() after - * converting column names into indices using a hash table. - * - * @param[in] col_name The column to look for - */ - CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { - auto & col_names = this->data->col_names; - auto col_pos = col_names->index_of(col_name); - if (col_pos > -1) { - return this->operator[](col_pos); + return result; } - throw std::runtime_error("Can't find a column named " + col_name); - } + CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept + { + const auto space = json_extra_space(s); + if (space == 0) + { + return std::string(s); + } - CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; - for (size_t i = 0; i < size(); i++) - ret.push_back(std::string(this->get_field(i))); + // create a result string of necessary size + size_t result_size = s.size() + space; + std::string result(result_size, '\\'); + std::size_t pos = 0; - return ret; - } + for (const auto& c : s) + { + switch (c) + { + // quotation mark (0x22) + case '"': + { + result[pos + 1] = '"'; + pos += 2; + break; + } - CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const - { - using internals::ParseFlags; - if (index >= this->size()) - throw std::runtime_error("Index out of bounds."); + // reverse solidus (0x5c) + case '\\': + { + // nothing to change + pos += 2; + break; + } - const size_t field_index = this->fields_start + index; - auto& field = this->data->fields[field_index]; - auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - if (field.has_double_quote) { - auto& value = this->data->double_quote_fields[field_index]; - if (value.empty()) { - bool prev_ch_quote = false; - for (size_t i = 0; i < field.length; i++) { - if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { - if (prev_ch_quote) { - prev_ch_quote = false; - continue; - } - else { - prev_ch_quote = true; - } - } + // backspace (0x08) + case '\b': + { + result[pos + 1] = 'b'; + pos += 2; + break; + } - value += field_str[i]; + + // formfeed (0x0c) + case '\f': + { + result[pos + 1] = 'f'; + pos += 2; + break; } - } - return csv::string_view(value); - } - return field_str.substr(0, field.length); - } + // newline (0x0a) + case '\n': + { + result[pos + 1] = 'n'; + pos += 2; + break; + } - CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { - size_t start = 0, end = 0; - // Trim out whitespace chars - for (; start < this->sv.size() && this->sv[start] == ' '; start++); - for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - - int value_ = 0; + // carriage return (0x0d) + case '\r': + { + result[pos + 1] = 'r'; + pos += 2; + break; + } - size_t digits = (end - start); - size_t base16_exponent = digits - 1; - if (digits == 0) return false; + // horizontal tab (0x09) + case '\t': + { + result[pos + 1] = 't'; + pos += 2; + break; + } - for (const auto& ch : this->sv.substr(start, digits)) { - int digit = 0; - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit = static_cast(ch - '0'); - break; - case 'a': - case 'A': - digit = 10; - break; - case 'b': - case 'B': - digit = 11; - break; - case 'c': - case 'C': - digit = 12; - break; - case 'd': - case 'D': - digit = 13; - break; - case 'e': - case 'E': - digit = 14; - break; - case 'f': - case 'F': - digit = 15; - break; - default: - return false; + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // print character c as \uxxxx + snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); + pos += 6; + // overwrite trailing null character + result[pos] = '\\'; + } + else + { + // all other characters are added as-is + result[pos++] = c; + } + break; + } + } } - value_ += digit * (int)pow(16, (double)base16_exponent); - base16_exponent--; + return result; } - - parsedValue = value_; - return true; - } - -#ifdef _MSC_VER -#pragma region CSVRow Iterator -#endif - /** Return an iterator pointing to the first field. */ - CSV_INLINE CSVRow::iterator CSVRow::begin() const { - return CSVRow::iterator(this, 0); } - /** Return an iterator pointing to just after the end of the CSVRow. + /** Convert a CSV row to a JSON object, i.e. + * `{"col1":"value1","col2":"value2"}` * - * @warning Attempting to dereference the end iterator results - * in dereferencing a null pointer. + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for original columns. */ - CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { - return CSVRow::iterator(this, (int)this->size()); - } + CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) { + col_names = this->data ? this->get_col_names() : std::vector({}); + } - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { - return std::reverse_iterator(this->end()); - } + const size_t _n_cols = col_names.size(); + std::string ret = "{"; + + for (size_t i = 0; i < _n_cols; i++) { + auto& col = col_names[i]; + auto field = this->operator[](col); - CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { - return std::reverse_iterator(this->begin()); - } + // TODO: Possible performance enhancements by caching escaped column names + ret += '"' + internals::json_escape_string(col) + "\":"; - CSV_INLINE HEDLEY_NON_NULL(2) - CSVRow::iterator::iterator(const CSVRow* _reader, int _i) - : daddy(_reader), i(_i) { - if (_i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](_i)); - else - this->field = nullptr; - } + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; - CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { - return *(this->field.get()); - } + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; + } - CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - // Using CSVField * as pointer type causes segfaults in MSVC debug builds - #ifdef _MSC_BUILD - return this->field; - #else - return this->field.get(); - #endif + ret += '}'; + return ret; } - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { - // Pre-increment operator - this->i++; - if (this->i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](i)); - else // Reached the end of row - this->field = nullptr; - return *this; - } + /** Convert a CSV row to a JSON array, i.e. + * `["value1","value2",...]` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for all columns. + */ + CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) + col_names = this->data ? this->get_col_names() : std::vector({}); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { - // Post-increment operator - auto temp = *this; - this->operator++(); - return temp; - } + const size_t _n_cols = col_names.size(); + std::string ret = "["; - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { - // Pre-decrement operator - this->i--; - this->field = std::make_shared( - this->daddy->operator[](this->i)); - return *this; - } + for (size_t i = 0; i < _n_cols; i++) { + auto field = this->operator[](col_names[i]); - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { - // Post-decrement operator - auto temp = *this; - this->operator--(); - return temp; - } - - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator(this->daddy, i + (int)n); - } + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator::operator+(-n); + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; + } + + ret += ']'; + return ret; } -#ifdef _MSC_VER -#pragma endregion CSVRow Iterator -#endif } /** @file @@ -8458,95 +8458,82 @@ namespace csv { return csv_dtypes; } } -/** @file - * Defines an object used to store CSV format settings - */ - -#include -#include +#include +#include namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; + return parse(in, format); } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * + */ + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); } - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); } - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise + * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file + */ + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); } - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); - if (i + 1 < intersection.size()) - err_msg += ", "; - } + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; - throw std::runtime_error(err_msg + '.'); - } + return info; } } diff --git a/tests/data b/tests/data index 5e94ae8e..b7a848f7 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 5e94ae8e97c5fd447e1c37ca12eb614e2483710e +Subproject commit b7a848f7b18f9cc9a8ab7a38338eaa2525afa136 diff --git a/tests/test_csv_stat.cpp b/tests/test_csv_stat.cpp index 0772bb47..d8551c25 100644 --- a/tests/test_csv_stat.cpp +++ b/tests/test_csv_stat.cpp @@ -1,9 +1,26 @@ +#include #include "catch.hpp" #include "csv.hpp" using namespace csv; const std::string PERSONS_CSV = "./tests/data/mimesis_data/persons.csv"; +// Regression test for #208: Try to parse an empty file shouldn't result in a SEGFAULT +TEST_CASE("Empty File", "[read_csv_stat_empty]") { + bool error_caught = false; + + try { + CSVStat stats("./tests/data/fake_data/empty.csv"); + stats.get_mins(); + } + catch (std::runtime_error& err) { + error_caught = true; + REQUIRE(strcmp(err.what(), "Cannot open file ./tests/data/fake_data/empty.csv") == 0); + } + + REQUIRE(error_caught); +} + TEST_CASE("Calculating Statistics from Direct Input", "[read_csv_stat_direct]" ) { std::string int_str; std::stringstream int_list;