Skip to content

Commit

Permalink
Fixes #304 - n-initial-skip-rows failed with new csv parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Jun 19, 2022
1 parent 174e6c0 commit e0e7ab6
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 8 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -1,4 +1,7 @@
# Changelog
# 6.089
* CSV parsing now supports `:comment-char` that defaults to #. Lines that begin with this character are ignored.
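* Fix for [issue 304](https://github.com/techascent/tech.ml.dataset/issues/304/) - `:n-initial-skip-rows` failed with the new csv parser.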

# 6.087
* Fix pd-merge `:outer` conditional - [issue 302](https://github.com/techascent/tech.ml.dataset/issues/302/)
Expand Down
5 changes: 3 additions & 2 deletions project.clj
@@ -1,12 +1,13 @@
(defproject techascent/tech.ml.dataset "6.089-SNAPSHOT"
(defproject techascent/tech.ml.dataset "6.089"
:description "Clojure high performance data processing system"
:url "http://github.com/techascent/tech.ml.dataset"
:license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.10.3" :scope "provided"]
[cnuernber/dtype-next "9.028"]
[techascent/tech.io "4.13"
[techascent/tech.io "4.15"
:exclusions [org.apache.commons/commons-compress]]
;;[com.cnuernber/charred "1.010"]
[com.univocity/univocity-parsers "2.9.0"]
[org.apache.poi/poi-ooxml "5.1.0"
:exclusions [commons-codec]]
Expand Down
3 changes: 3 additions & 0 deletions src/tech/v3/dataset/io/csv.clj
Expand Up @@ -52,6 +52,9 @@
:as options}
row-seq]
(let [row-iter (pfor/->iterator row-seq)
n-initial-skip-rows (long (get options :n-initial-skip-rows 0))
_ (dotimes [idx n-initial-skip-rows]
(when (.hasNext row-iter) (.next row-iter)))
header-row (if (and header-row? (.hasNext row-iter))
(vec (.next row-iter))
[])]
Expand Down
6 changes: 0 additions & 6 deletions src/tech/v3/dataset/io/univocity.clj
Expand Up @@ -240,12 +240,6 @@
(load-csv data options))


(defn csv->dataset
  "Load a csv file into a dataset.

  Thin wrapper: forwards `data` and `options` unchanged to `load-csv` and
  returns whatever `load-csv` returns."
  [data options]
  (load-csv data options))


;; Protocol with a single method, `apply-write-options!`, dispatched on its
;; first argument (`settings`) and also receiving `options`.
;; NOTE(review): the `!` suffix suggests implementations mutate `settings`
;; in place using values from `options` -- confirm against the implementations,
;; which are not visible here.
(defprotocol PApplyWriteOptions
  (apply-write-options! [settings options]))

Expand Down
8 changes: 8 additions & 0 deletions test/data/csv-comment.csv
@@ -0,0 +1,8 @@
# Program:featureCounts v1.6.5; Command:"featureCounts" "-a" "/Refs/NC_003028.gtf" "-o" "/ExpOut/220601_NS500751_0199_AHGMNMBGXL/Out/Rep/Fcnts/CDS/T4-1393NDC180m-a.csv" "-t" "CDS" "-R" "CORE" "/ExpOut/220601_NS500751_0199_AHGMNMBGXL/Out/Rep/Bams/T4-1393NDC180m-a.bam"
Geneid,Chr,Start,End,Strand,Length,/ExpOut/220601_NS500751_0199_AHGMNMBGXL/Out/Rep/Bams/T4-1393NDC180m-a.bam
SP_0001, NC_003028, 197 1558,+, 1362,566
SP_0002, NC_003028, 1717,,2853,+, 1137,603
SP_0003, NC_003028, 2864,,3112,+, 249, 67
SP_#003, #NC_003028,2864,,3112,+, 249, 67
#SP_#003, #NC_003028,2864,,3112,+, 249, 67
SP_#003, #NC_003028,2864,,3112,+, 249, 67
11 changes: 11 additions & 0 deletions test/tech/v3/dataset/parse_test.clj
Expand Up @@ -478,3 +478,14 @@
(let [ds (ds/->dataset "test/data/empty-csv.csv")]
(is (= 0 (ds/column-count ds)))
(is (ds/dataset? ds))))


(deftest comment-char
  ;; Loads a CSV whose first line and one interior line begin with `#`.
  ;; Those lines are expected to be dropped, leaving five data rows, and the
  ;; final two surviving rows are expected to compare equal.
  (let [dataset (ds/->dataset "test/data/csv-comment.csv")
        row-reader (ds/rows dataset)
        n-rows (ds/row-count dataset)]
    (is (= 5 n-rows))
    ;; Negative indexes address rows from the end of the reader.
    (is (= (row-reader -1) (row-reader -2)))))

(deftest issue-304
  ;; Regression test for issue 304: load the fixture with
  ;; `:n-initial-skip-rows` set to 10, then check that the first value of the
  ;; column named "10" is 11.
  (let [dataset (ds/->dataset "test/data/issue-292.csv"
                              {:n-initial-skip-rows 10})]
    (is (= 11 (first (dataset "10"))))))

0 comments on commit e0e7ab6

Please sign in to comment.