Skip to content

Commit

Permalink
Origin commit, functionnalities almost complete, still miss data obje…
Browse files Browse the repository at this point in the history
…cts as an alternative to arrays, and some more source/destination handling
  • Loading branch information
wdavidw committed Sep 25, 2010
0 parents commit c253672
Show file tree
Hide file tree
Showing 38 changed files with 845 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.*
*.tmp
lib-cov
!.gitignore
254 changes: 254 additions & 0 deletions lib/csv.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@

// Module CSV - Copyright David Worms <open@adaltas.com> (MIT Licensed)

var EventEmitter = require('events').EventEmitter,
fs = require('fs');

// Utils function
// Shallow merge helper: copies every enumerable property of `obj2`
// (inherited ones included) onto `obj1` and returns the target.
// When `obj1` is falsy a fresh object is created and returned instead,
// so callers must use the return value in that case.
var merge = function(obj1,obj2){
  var target = obj1 ? obj1 : {};
  var name;
  for(name in obj2){
    target[name] = obj2[name];
  }
  return target;
};

module.exports = function(){
// Mutable parser/writer state shared by parse(), flush() and finish().
var state = {
// Number of records flushed so far; also passed as the record index to
// the transform callback and the 'data' event.
count: 0,
// Characters accumulated for the field currently being read.
field: '',
// Fields already completed for the record currently being read.
line: [],
// Last character handled by parse(); used to recognise "\r\n" pairs.
lastC: '',
// True while the parser is inside a quoted field.
quoted: false,
// When true, parse() ignores characters. NOTE(review): nothing in this
// file ever sets it to true.
commented: false,
// Output Buffer allocated by toStream(); stays null when no destination
// has been configured.
buffer: null,
// Number of bytes currently pending in `buffer`.
bufferPosition: 0
}

// Defined Class

// CSV reader/writer. Instances are EventEmitters ('data', 'end', 'error')
// and carry two option bags:
//   readOptions  - how the source is opened and tokenised,
//   writeOptions - how records are serialised to the destination.
var CSV = function(){
  // Run the parent constructor so the emitter part of the instance is
  // initialised the documented way instead of relying on lazy setup.
  EventEmitter.call(this);
  // Reading defaults; fromPath() also hands this object to
  // fs.createReadStream.
  this.readOptions = {
    flags: 'r',
    encoding: 'utf8',
    bufferSize: 8 * 1024 * 1024,
    separator: ',',
    escape: '"',
    quote: '"'
  };
  // Writing defaults; null means "fall back": bufferSize falls back to
  // readOptions.bufferSize, lineBreaks to the style detected while parsing.
  this.writeOptions = {
    bufferSize: null,
    lineBreaks: null
  };
};
// Inherit the EventEmitter API (on/emit/...).
CSV.prototype.__proto__ = EventEmitter.prototype;

// Reading API

// Uses in-memory data as the source.
//   data    - anything parse() can stringify (it is coerced with ''+data).
//   options - optional read options, merged into this.readOptions the same
//             way fromStream() and fromPath() do.
// Parsing is deferred to the next tick so the caller can finish chaining
// (destination, transform, listeners) before any event is emitted.
// Returns the instance for chaining.
CSV.prototype.from = function(data,options){
  if(options){
    merge(this.readOptions,options);
  }
  process.nextTick(function(){
    parse(data);
    finish();
  });
  return this;
}
// Uses a readable stream as the source.
//   readStream - an EventEmitter emitting 'data', 'error' and 'end'.
//   options    - optional read options, merged into this.readOptions.
// Throws an Error when `readStream` is not an EventEmitter.
// Returns the instance for chaining.
CSV.prototype.fromStream = function(readStream, options){
  // Parenthesised on purpose: `!readStream instanceof EventEmitter` negates
  // the stream first and therefore never throws.
  if(!(readStream instanceof EventEmitter)) throw new Error('Invalid stream');
  if(options){
    merge(this.readOptions,options);
  }
  var self = this;
  readStream.on('data', function(data) { parse(data) });
  // Source errors are re-emitted on the CSV instance.
  readStream.on('error', function(error) { self.emit('error', error) });
  readStream.on('end', function() {
    finish();
  });
  this.readStream = readStream;
  return this;
}
// Uses a file as the source.
//   path    - location of the file to read.
//   options - optional read options, merged into this.readOptions; the
//             merged object is also what fs.createReadStream receives.
// Returns the instance for chaining (via fromStream).
CSV.prototype.fromPath = function(path, options){
  // merge() returns its first argument, i.e. this.readOptions itself.
  var settings = merge(this.readOptions, options);
  var source = fs.createReadStream(path, settings);
  source.setEncoding(settings.encoding);
  // Options are already merged, so none are forwarded.
  return this.fromStream(source, null);
}

// Writting API

// Uses a writable stream as the destination.
//   writeStream - an EventEmitter with write()/end(); its 'close' event is
//                 relayed as the CSV 'end' event with the record count.
//   options     - optional write options, merged into this.writeOptions.
//                 `lineBreaks` accepts the aliases 'auto', 'unix', 'mac',
//                 'windows' and 'unicode'; any other value is kept as is.
// Throws an Error when `writeStream` is not an EventEmitter.
// Returns the instance for chaining.
CSV.prototype.toStream = function(writeStream, options){
  // Parenthesised on purpose: `!writeStream instanceof EventEmitter` negates
  // the stream first and therefore never throws.
  if(!(writeStream instanceof EventEmitter)) throw new Error('Invalid stream');
  var self = this;
  merge(this.writeOptions,options);
  switch(this.writeOptions.lineBreaks){
    case 'auto':
      // null lets parse() record the first line break found in the source.
      this.writeOptions.lineBreaks = null;
      break;
    case 'unix':
      this.writeOptions.lineBreaks = "\n";
      break;
    case 'mac':
      this.writeOptions.lineBreaks = "\r";
      break;
    case 'windows':
      this.writeOptions.lineBreaks = "\r\n";
      break;
    case 'unicode':
      this.writeOptions.lineBreaks = "\u2028";
      break;
  }
  writeStream.on('close', function(){
    self.emit('end',state.count);
  })
  this.writeStream = writeStream;
  // Output is staged in this buffer by flush() before reaching the stream.
  state.buffer = new Buffer(this.writeOptions.bufferSize||this.readOptions.bufferSize);
  state.bufferPosition = 0;
  return this;
}
// Uses a file as the destination.
//   path    - location of the file to write.
//   options - optional write options; `flags` defaults to 'w' and
//             `encoding` to 'utf8'. The same object is passed to
//             fs.createWriteStream and to toStream().
// Returns the instance for chaining (via toStream).
CSV.prototype.toPath = function(path, options){
  // Defaults first, caller values on top. The previous argument order
  // overwrote the caller's flags/encoding (so 'a' could never be used) and
  // lost the defaults altogether when `options` was omitted. Merging into a
  // fresh object also leaves the caller's object untouched.
  var settings = merge({
    flags: 'w',
    encoding: 'utf8'
  }, options);
  var stream = fs.createWriteStream(path, settings);
  return this.toStream(stream, settings);
}

// Transform API

// Registers the per-record transformer.
//   callback - called by flush() as callback(fields, index); its return
//              value replaces the record, and null skips the record.
// Returns the instance for chaining.
CSV.prototype.transform = function(callback){
  var instance = this;
  instance.transformer = callback;
  return instance;
}

// The single instance used by the closure helpers below (parse, flush,
// finish) and returned to the caller of the exported factory.
var csv = new CSV();

// Feeds one chunk of input through the character-level state machine.
//   chars - chunk to parse; coerced to a string.
// Completed fields are pushed onto state.line and every completed record is
// handed to flush(). State lives in `state`, so a record may span chunks.
// Throws an Error when a closing quote is followed by anything other than a
// separator, a line break or the end of the chunk.
// NOTE(review): one-character look-ahead (escaped quotes, "\r\n" detection)
// does not cross chunk boundaries.
function parse(chars){
  chars = ''+chars;
  var nextChar;
  for (var i = 0, l = chars.length; i < l; i++) {
    var c = chars.charAt(i);
    switch (c) {
      case csv.readOptions.escape:
      case csv.readOptions.quote:
        if( state.commented ) break;
        var isEscape = false;
        // A quote at the very start of an unquoted field opens the field.
        // Without this check, when escape and quote are the same character,
        // the empty field `""` was read as an escaped literal quote.
        var opensField = c === csv.readOptions.quote && !state.quoted && state.field === '';
        if (c === csv.readOptions.escape && !opensField) {
          nextChar = chars.charAt(i + 1);
          if (nextChar === csv.readOptions.escape || nextChar === csv.readOptions.quote) {
            // Escaped character: keep it and skip the escape itself.
            i++;
            isEscape = true;
            c = chars.charAt(i);
            state.field += c;
          }
        }
        if (!isEscape && (c === csv.readOptions.quote)) {
          if (state.field && !state.quoted) {
            // Treat quote as a regular character
            state.field += c;
            break;
          }
          if (state.quoted) {
            // Make sure a closing quote is followed by a separator
            nextChar = chars.charAt(i + 1);
            if (nextChar && nextChar != '\r' && nextChar != '\n' && nextChar !== csv.readOptions.separator) {
              throw new Error('Invalid closing quote; found "' + nextChar + '" instead of separator "' + csv.readOptions.separator + '"');
            }
            state.quoted = false;
          } else if (state.field === '') {
            state.quoted = true;
          }
        }
        break;
      case csv.readOptions.separator:
        if( state.commented ) break;
        if( state.quoted ) {
          state.field += c;
        }else{
          state.line.push(state.field);
          state.field = '';
        }
        break;
      case '\n':
        // Second half of a "\r\n" pair whose "\r" already ended the record.
        // The original tested `csv.readOptions.quoted`, which does not exist.
        if( !state.quoted && state.lastC === '\r' ){
          break;
        }
        // fall through
      case '\r':
        if( state.quoted ){
          // Line breaks inside a quoted field belong to the field.
          state.field += c;
          break;
        }
        if( csv.writeOptions.lineBreaks === null ){
          // Remember the first line-break style seen, for the writer.
          csv.writeOptions.lineBreaks = c + ( c === '\r' && chars.charAt(i+1) === '\n' ? '\n' : '' );
        }
        state.line.push(state.field);
        state.field = '';
        flush();
        break;
      default:
        if (state.commented) break;
        state.field += c;
    }
    state.lastC = c;
  }
}

// Finalises the record held in state.line:
//   1. runs it through the transformer, if any (null skips the record),
//   2. emits 'data' with the result and the record index,
//   3. when a destination is configured, serialises the result and stages
//      it in the output buffer, writing the buffer out when it is full.
// Array results are serialised as one CSV line; string results are written
// unchanged.
// NOTE(review): non-array objects are emitted on 'data' but are not
// serialised yet, so they cannot be written to a destination.
function flush(){
  var line = csv.transformer?csv.transformer(state.line,state.count):state.line;
  if(line !== null){
    csv.emit('data',line,state.count);
  }
  state.count++;
  if(line !== null){
    if(line instanceof Array){
      var separator = csv.writeOptions.separator||csv.readOptions.separator;
      var quote = csv.writeOptions.quote||csv.readOptions.quote;
      var escape = csv.writeOptions.escape||csv.readOptions.escape;
      // No line break configured or detected yet (e.g. a single-line
      // source): use "\n" rather than appending the text "null".
      var lineBreak = csv.writeOptions.lineBreaks != null ? csv.writeOptions.lineBreaks : "\n";
      var cells = line.map(function(field){
        // A transformer may return numbers, null, etc.
        field = field == null ? '' : String(field);
        var containsSeparator = field.indexOf(separator)>=0;
        var containsQuote = field.indexOf(quote)>=0;
        var containsLineBreak = field.indexOf('\r')>=0 || field.indexOf('\n')>=0;
        if(containsQuote){
          // split/join escapes every quote; String.replace with a string
          // pattern only replaced the first one.
          field = field.split(quote).join(escape+quote);
        }
        if(containsQuote||containsSeparator||containsLineBreak){
          field = quote+field+quote;
        }
        return field;
      });
      line = cells.join(separator)+lineBreak;
    }
    if(state.buffer){
      // Use the real capacity: toStream() may have sized the buffer from
      // writeOptions.bufferSize rather than readOptions.bufferSize.
      var capacity = state.buffer.length;
      var size = Buffer.byteLength(line,'utf8');
      if(state.bufferPosition+size>capacity && state.bufferPosition>0){
        csv.writeStream.write(state.buffer.slice(0, state.bufferPosition));
        // The slice above shares memory with the old buffer, so start a
        // new one instead of overwriting data still waiting to be written.
        state.buffer = new Buffer(capacity);
        state.bufferPosition = 0;
      }
      if(size>capacity){
        // A record larger than the whole buffer would be truncated by
        // Buffer#write; send it straight to the stream.
        csv.writeStream.write(line,'utf8');
      }else{
        state.bufferPosition += state.buffer.write(line,state.bufferPosition,'utf8');
      }
    }
  }
  state.line = [];
  state.lastC = '';
}
// Called once the source is exhausted.
// Emits 'error' when the input ends inside a quoted field. Otherwise it
// flushes the record still being read (a source need not end with a line
// break), then either writes the pending buffer and ends the destination
// stream, or, when there is no destination, emits 'end' with the record
// count.
function finish(){
  if (state.quoted) {
    csv.emit('error', new Error('Quoted field not terminated'));
    return;
  }
  // dump open record
  // A non-empty state.line means a separator has been read, so a field is
  // still pending even when it is empty ("a,b," has three fields).
  if (state.field || state.line.length > 0) {
    state.line.push(state.field);
    state.field = '';
  }
  if (state.line.length > 0) {
    flush();
  }
  if(csv.writeStream){
    csv.writeStream.write(state.buffer.slice(0, state.bufferPosition));
    // 'end' is emitted later, from the stream's 'close' event.
    csv.writeStream.end();
  }else{
    csv.emit('end',state.count);
  }
}
return csv;
};
7 changes: 7 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{ "name": "csv"
, "version": "0.0.1"
, "description": "CSV parser with simple api, full of options and tested against large datasets."
, "author": "David Worms <david@adaltas.com>"
, "main": "./lib/csv"
, "engines": { "node": ">= 0.1.90" }
}
108 changes: 108 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@

<pre>
_ _ _ _____ _______ __
| \ | | | | / ____|/ ____\ \ / /
| \| | ___ __| | ___ | | | (___ \ \ / /
| . ` |/ _ \ / _` |/ _ \ | | \___ \ \ \/ /
| |\ | (_) | (_| | __/ | |____ ____) | \ /
|_| \_|\___/ \__,_|\___| \_____|_____/ \/
</pre>

This project provides CSV parsing and has been tested and used on large source files (over 2 GB).

- Line breaks discovery: when reading from a file, it is not required to provide line breaks, when writing to a file it will default to the same style.

Quick example
-------------

Using the library is a 3-step process where all steps are optional:

1. Create a source
2. Create a destination
3. Transform the data

var csv = require('csv');
csv()
.fromPath('/tmp/csv.in')
.toPath('/tmp/csv.out')
.transform(function(data){
data.unshift(data.pop());
return data;
});

Creating a source
-----------------

Options are:

The following methods are available:

- *fromPath*
Takes a file path as first argument and optionally an object of options as second argument.

- *fromStream*
Takes a readable stream as first argument and optionally an object of options as second argument.

- *from*
Takes a string, a buffer, an array or an object as first argument and optionally an object of options as second argument.

Creating a destination
----------------------

Options are:

- *encoding* Default to 'utf8'
- *lineBreaks* Default to 'auto', special values are 'auto', 'unix', 'mac', 'windows', 'unicode'.
- *flags* Default to 'w', 'w' to create or overwrite a file, 'a' to append to a file. Applies when using the `toPath` method.
- *bufferSize* Size of the internal buffer holding data before it is flushed into a stream. Applies when the destination is a stream.

The following methods are available:

- *toPath*
Takes a file path as first argument and optionally an object of options as second argument.

- *toStream*
Takes a writable stream as first argument and optionally an object of options as second argument.

Transforming data
-----------------

You may provide a callback to the `transform` method. The contract is quite simple: you receive an array of fields for each record and return the transformed record. The return value may be an array, an associative array, a string or null. If null, the record will simply be skipped.

Events
------

By extending the Node `EventEmitter` class, the library provides a few useful events:

- *data* (function(data, index){})
Emitted each time a row has been parsed and passed through the `transform` callback; the data is the value returned by `transform`. Note, however, that the event is not emitted when `transform` returns `null`, since the record is skipped.
The callback receives two arguments:
`data` is the CSV line being processed (by default as an array)
`index` is the index number of the line starting at zero

- *end*
If you are redirecting the output to a file using the `toPath` method, the event is emitted once the writing process is complete and the file is closed.

- *error*
Thrown whenever an error is captured.

Running the tests
-----------------

Tests are executed with expresso. To install it, simply use `npm install expresso`.

To run the tests
expresso -I lib test/*

To develop with the tests watching your changes
expresso -w -I lib test/*

To instrument the tests
expresso -I lib --cov test/*

Related projects
----------------

* Pavel Kolesnikov "ya-csv": http://github.com/wdavidw/ya-csv
* Chris Williams "node-csv": http://github.com/voodootikigod/node-csv

Loading

0 comments on commit c253672

Please sign in to comment.