intenthq
diff --git a/‎README.md
Lines changed: 46 additions & 11 deletions b/‎README.md
Lines changed: 46 additions & 11 deletions
diff --git a/‎anonymisations.go
Lines changed: 16 additions & 0 deletions b/‎anonymisations.go
Lines changed: 16 additions & 0 deletions
diff --git a/‎anonymisations_test.go
Lines changed: 59 additions & 8 deletions b/‎anonymisations_test.go
Lines changed: 59 additions & 8 deletions
diff --git a/‎config.go
Lines changed: 10 additions & 10 deletions b/‎config.go
Lines changed: 10 additions & 10 deletions
diff --git a/‎config_test.go
Lines changed: 4 additions & 8 deletions b/‎config_test.go
Lines changed: 4 additions & 8 deletions
diff --git a/‎config_test.json
Lines changed: 3 additions & 3 deletions b/‎config_test.json
Lines changed: 3 additions & 3 deletions
diff --git a/‎csv_processor.go
Lines changed: 70 additions & 0 deletions b/‎csv_processor.go
Lines changed: 70 additions & 0 deletions
@@ -8,7 +8,7 @@
 </a> [![Go Report Card](https://goreportcard.com/badge/github.com/intenthq/anon)](https://goreportcard.com/report/github.com/intenthq/anon) [![License](https://img.shields.io/npm/l/express.svg)](https://github.com/intenthq/anon/LICENSE)
 ![GitHub release](https://img.shields.io/github/release/intenthq/anon.svg)
 
-Anon is a tool for taking delimited files and anonymising or transforming columns until the output is useful for applications where sensitive information cannot be exposed.
+Anon is a tool for taking delimited files and anonymising or transforming columns/fields until the output is useful for applications where sensitive information cannot be exposed. Currently this tool supports both CSV and JSON files (with one level of depth).
 
 ## Installation
 
@@ -24,17 +24,20 @@ anon [--config <path to config file, default is ./config.json>]
 Anon is designed to take input from `STDIN` and by default will output the anonymised file to `STDOUT`:
 
 ```sh
-anon < some_file.csv > some_file_anonymised.csv
+anon < some_file > some_file_anonymised
 ```
 
 ### Configuration
 
-In order to be useful, Anon needs to be told what you want to do to each column of the CSV. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
+In order to be useful, Anon needs to be told what you want to do to each column/field of the input. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
 
 ```json5
 {
-  "csv": {
-    "delimiter": ","
+  // Name of the format of the input file
+  // Currently supports "csv" and "json"
+  "formatName": {
+    // Options for the format you have picked go here.
+    // See the documentation for the format you choose below.
   },
   // Optionally define a number of rows to randomly sample down to.
   // To do it, it will hash (using FNV-1 32 bits) the column with the ID
@@ -44,15 +47,14 @@ In order to be useful, Anon needs to be told what you want to do to each column
     // Number used to mod the hash of the id and determine if the row
     // has to be included in the sample or not
     "mod": 30000
-    // Specify in which a column a unique ID exists on which the sampling can
-    // be performed. Indices are 0 based, so this would sample on the first
-    // column.
-    "idColumn": 0
   },
   // An array of actions to take on each column - indices are 0 based, so index
   // 0 in this array corresponds to column 1, and so on.
   //
-  // There must be an action for every column in the CSV.
+  // If anonymising a CSV, there must be an action for every column in it.
+  // If anonymising a JSON, there must be an action for each field that needs to
+  // be anonymised. If there is no action defined for a specific field, this
+  // field value will be left untouched.
   "actions": [
     {
       // The no-op, leaves the input unchanged.
@@ -61,7 +63,10 @@ In order to be useful, Anon needs to be told what you want to do to each column
     {
       // Takes a UK format postcode (eg. W1W 8BE) and just keeps the outcode
       // (eg. W1W).
-      "name": "outcode"
+      "name": "outcode",
+      // The field in the JSON that this action is applied to. If a field in
+      // the JSON doesn't have an action defined, it will be left untouched.
+      "jsonField": "postcode"
     },
     {
       // Hash (SHA1) the input.
@@ -100,6 +105,36 @@ In order to be useful, Anon needs to be told what you want to do to each column
 }
 ```
 
+## Formats
+
+You can use CSV or JSON files as input.
+
+### CSV
+
+For a CSV file you will need a config like this:
+
+```json5
+"csv": {
+  "delimiter": ",",
+  // Specify in which column a unique ID exists on which the sampling can
+  // be performed. Indices are 0 based, so this would sample on the first
+  // column.
+  "idColumn": 0
+}
+```
+
+### JSON
+
+For a JSON file you will need a config like this:
+
+```json5
+"json": {
+  // Specify in which field a unique ID exists on which the sampling can
+  // be performed.
+  "idField": "id"
+}
+```
+
 ## Contributing
 
 Any contribution will be welcome, please refer to our [contributing guidelines](CONTRIBUTING.md) for more information.
 
@@ -32,6 +32,7 @@ type RangeConfig struct {
 type ActionConfig struct {
 	Name        string
 	Salt        *string
+	JsonField   *string
 	DateConfig  DateConfig
 	RangeConfig []RangeConfig
 }
@@ -48,6 +49,21 @@ func anonymisations(configs *[]ActionConfig) ([]Anonymisation, error) {
 	return res, nil
 }
 
+// anonymisationsMap builds one Anonymisation per configured action and returns
+// them keyed by each action's JsonField. It returns an error (and a nil map)
+// if an action has no JsonField or if its anonymisation cannot be created.
+func anonymisationsMap(configs *[]ActionConfig) (map[string]Anonymisation, error) {
+	byField := make(map[string]Anonymisation)
+	for _, action := range *configs {
+		if action.JsonField == nil {
+			return nil, errors.New("You need to define a JsonField for each action configured.")
+		}
+		anon, err := action.create()
+		if err != nil {
+			return nil, err
+		}
+		byField[*action.JsonField] = anon
+	}
+	return byField, nil
+}
+
 // Returns the configured salt or a random one
 // if it's not set.
 func (ac *ActionConfig) saltOrRandom() string {
 
@@ -32,23 +32,74 @@ func assertAnonymisationFunction(t *testing.T, expected Anonymisation, actual An
 
 func TestAnonymisations(t *testing.T) {
 	t.Run("a valid configuration", func(t *testing.T) {
+		f1, f2 := "f1", "f2"
 		conf := &[]ActionConfig{
 			ActionConfig{
-				Name: "nothing",
+				Name:      "nothing",
+				JsonField: &f1,
 			},
 			ActionConfig{
-				Name: "hash",
-				Salt: &salt,
+				Name:      "hash",
+				Salt:      &salt,
+				JsonField: &f2,
 			},
 		}
-		anons, err := anonymisations(conf)
-		assert.NoError(t, err)
-		assertAnonymisationFunction(t, identity, anons[0], "a")
-		assertAnonymisationFunction(t, hash(salt), anons[1], "a")
+		t.Run("anonymisations should return an array with each anonymisation created", func(t *testing.T) {
+			anons, err := anonymisations(conf)
+			assert.NoError(t, err)
+			assertAnonymisationFunction(t, identity, anons[0], "a")
+			assertAnonymisationFunction(t, hash(salt), anons[1], "a")
+		})
+		t.Run("anonymisationsMap should return a map with each anonymisation created and indexed by field", func(t *testing.T) {
+			anons, err := anonymisationsMap(conf)
+			assert.NoError(t, err)
+			assertAnonymisationFunction(t, identity, anons[f1], "a")
+			assertAnonymisationFunction(t, hash(salt), anons[f2], "a")
+		})
 	})
 	t.Run("an invalid configuration", func(t *testing.T) {
 		conf := &[]ActionConfig{ActionConfig{Name: "year", DateConfig: DateConfig{Format: "3333"}}}
-		anons, err := anonymisations(conf)
+		t.Run("anonymisations should return an error", func(t *testing.T) {
+			anons, err := anonymisations(conf)
+			assert.Error(t, err, "should return an error")
+			assert.Nil(t, anons)
+		})
+		t.Run("anonymisationsMap should return an error", func(t *testing.T) {
+			anons, err := anonymisationsMap(conf)
+			assert.Error(t, err, "should return an error")
+			assert.Nil(t, anons)
+		})
+	})
+}
+
+func TestAnonymisationsMap(t *testing.T) {
+	var f1, f2 = "f1", "f2"
+	t.Run("a valid configuration", func(t *testing.T) {
+		conf := &[]ActionConfig{
+			ActionConfig{
+				JsonField: &f1,
+				Name:      "nothing",
+			},
+			ActionConfig{
+				JsonField: &f2,
+				Name:      "hash",
+				Salt:      &salt,
+			},
+		}
+		anons, err := anonymisationsMap(conf)
+		assert.NoError(t, err)
+		assertAnonymisationFunction(t, identity, anons[f1], "a")
+		assertAnonymisationFunction(t, hash(salt), anons[f2], "a")
+	})
+	t.Run("an action configuration without JsonField defined", func(t *testing.T) {
+		conf := &[]ActionConfig{ActionConfig{Name: "year"}}
+		anons, err := anonymisationsMap(conf)
+		assert.Error(t, err, "should return an error")
+		assert.Nil(t, anons)
+	})
+	t.Run("an invalid action configuration", func(t *testing.T) {
+		conf := &[]ActionConfig{ActionConfig{JsonField: &f1, Name: "year", DateConfig: DateConfig{Format: "3333"}}}
+		anons, err := anonymisationsMap(conf)
 		assert.Error(t, err, "should return an error")
 		assert.Nil(t, anons)
 	})
 
@@ -8,28 +8,29 @@ import (
 // CsvConfig stores the config to read and write the csv file
 type CsvConfig struct {
 	Delimiter string
+	IDColumn  uint32
+}
+
+// JsonConfig stores the config to read and write the json file
+type JsonConfig struct {
+	IDField string
 }
 
 // SamplingConfig stores the config to know how to sample the file
 type SamplingConfig struct {
-	Mod      uint32
-	IDColumn uint32
+	Mod uint32
 }
 
 // Config stores all the configuration
 type Config struct {
-	Csv      CsvConfig
+	Csv      *CsvConfig
+	Json     *JsonConfig
 	Sampling SamplingConfig
 	Actions  []ActionConfig
 }
 
-var defaultCsvConfig = CsvConfig{
-	Delimiter: ",",
-}
-
 var defaultSamplingConfig = SamplingConfig{
-	Mod:      1,
-	IDColumn: 0,
+	Mod: 1,
 }
 
 var defaultActionsConfig = []ActionConfig{}
@@ -42,7 +43,6 @@ func loadConfig(filename string) (*Config, error) {
 	}
 	decoder := json.NewDecoder(file)
 	conf := Config{
-		Csv:      defaultCsvConfig,
 		Sampling: defaultSamplingConfig,
 		Actions:  defaultActionsConfig,
 	}
 
@@ -22,12 +22,8 @@ func TestLoadConfig(t *testing.T) {
 		conf, err := loadConfig("config_defaults_test.json")
 		require.NoError(t, err, "should return no error if the config can be loaded")
 		assert.Equal(t, Config{
-			Csv: CsvConfig{
-				Delimiter: ",",
-			},
 			Sampling: SamplingConfig{
-				Mod:      1,
-				IDColumn: 0,
+				Mod: 1,
 			},
 			Actions: []ActionConfig{},
 		}, *conf, "should fill the config with the default values")
@@ -39,12 +35,12 @@ func TestLoadConfig(t *testing.T) {
 		conf, err := loadConfig("config_test.json")
 		require.NoError(t, err, "should return no error if the config can be loaded")
 		assert.Equal(t, Config{
-			Csv: CsvConfig{
+			Csv: &CsvConfig{
 				Delimiter: "|",
+				IDColumn:  84,
 			},
 			Sampling: SamplingConfig{
-				Mod:      77,
-				IDColumn: 84,
+				Mod: 77,
 			},
 			Actions: []ActionConfig{
 				ActionConfig{
 
@@ -1,10 +1,10 @@
 {
   "csv": {
-    "delimiter": "|"
+    "delimiter": "|",
+    "idColumn": 84
   },
   "sampling": {
-    "mod": 77,
-    "idColumn": 84
+    "mod": 77
   },
   "actions": [
     {
 
@@ -0,0 +1,70 @@
+package main
+
+import (
+	"encoding/csv"
+	"fmt"
+	"io"
+	"log"
+	"os"
+)
+
+// processCsv anonymises the CSV read from inputFile and writes the result to
+// outputFile, using the CSV options, sampling settings and actions in conf.
+//
+// It returns an error if conf has no CSV section, if the configured actions
+// are invalid, or if anonymiseCsv fails.
+func processCsv(inputFile string, outputFile string, conf *Config) error {
+	// Config.Csv is a pointer and stays nil when the config has no "csv"
+	// section; fail cleanly instead of dereferencing it below.
+	if conf.Csv == nil {
+		return fmt.Errorf("csv configuration is missing")
+	}
+
+	// Validate the actions before setting up the reader and writer, so an
+	// invalid configuration is reported before any input or output is opened
+	// (presumably initCsvWriter creates the output file — confirm in fileOr).
+	anons, err := anonymisations(&conf.Actions)
+	if err != nil {
+		return err
+	}
+
+	r := initCsvReader(inputFile, conf.Csv)
+	w := initCsvWriter(outputFile, conf.Csv)
+
+	return anonymiseCsv(r, w, conf, &anons)
+}
+
+// csvDelimiter returns the first rune of the configured delimiter. It falls
+// back to a comma when conf is nil or the delimiter is empty, so an incomplete
+// "csv" section cannot trigger an index-out-of-range panic.
+func csvDelimiter(conf *CsvConfig) rune {
+	if conf == nil || conf.Delimiter == "" {
+		return ','
+	}
+	return []rune(conf.Delimiter)[0]
+}
+
+// initCsvReader returns a CSV reader over the source chosen by fileOr for
+// filename (os.Stdin and os.Open are passed as the alternatives), using the
+// delimiter from conf.
+func initCsvReader(filename string, conf *CsvConfig) *csv.Reader {
+	reader := csv.NewReader(fileOr(filename, os.Stdin, os.Open))
+	reader.Comma = csvDelimiter(conf)
+	return reader
+}
+
+// initCsvWriter returns a CSV writer over the destination chosen by fileOr for
+// filename (os.Stdout and os.Create are passed as the alternatives), using the
+// delimiter from conf.
+func initCsvWriter(filename string, conf *CsvConfig) *csv.Writer {
+	writer := csv.NewWriter(fileOr(filename, os.Stdout, os.Create))
+	writer.Comma = csvDelimiter(conf)
+	return writer
+}
+
+// anonymiseCsv reads records from r, anonymises the sampled ones with anons
+// and writes them to w.
+//
+// Records with the wrong number of fields, and records that fail to
+// anonymise, are logged and skipped. Any other read error, an ID column that
+// is out of range for a record, or a failure writing to w stops processing
+// and is returned.
+func anonymiseCsv(r *csv.Reader, w *csv.Writer, conf *Config, anons *[]Anonymisation) error {
+	// i counts every record read (skipped ones included) and drives the
+	// periodic flush below.
+	for i := 0; ; i++ {
+		record, err := r.Read()
+		if err == io.EOF {
+			break
+		}
+		if pe, ok := err.(*csv.ParseError); ok && pe.Err == csv.ErrFieldCount {
+			// we just print the error and skip the record
+			log.Print(err)
+			continue
+		}
+		if err != nil {
+			return err
+		}
+		if int64(conf.Csv.IDColumn) >= int64(len(record)) {
+			return fmt.Errorf("id column (%d) out of range, record has %d columns", conf.Csv.IDColumn, len(record))
+		}
+		if !sample(record[conf.Csv.IDColumn], conf.Sampling) {
+			continue
+		}
+
+		anonymised, err := anonymise(record, *anons)
+		if err != nil {
+			// we just print the error and skip the record
+			log.Print(err)
+		} else if err := w.Write(anonymised); err != nil {
+			// A failed write means the output is broken; do not carry on
+			// silently producing a truncated file.
+			return err
+		}
+		//TODO decide how often do we want to flush
+		if i%100 == 0 {
+			w.Flush()
+		}
+	}
+	w.Flush()
+	// Flush does not return an error itself; Error reports any failure from
+	// the preceding Write or Flush calls.
+	return w.Error()
+}
Original file line number	Diff line number	Diff line change
`@@ -1,10 +1,10 @@`
`1`	`1`	`{`
`2`	`2`	`"csv": {`
`3`		`- "delimiter": "\|"`
	`3`	`+ "delimiter": "\|",`
	`4`	`+ "idColumn": 84`
`4`	`5`	`},`
`5`	`6`	`"sampling": {`
`6`		`- "mod": 77,`
`7`		`- "idColumn": 84`
	`7`	`+ "mod": 77`
`8`	`8`	`},`
`9`	`9`	`"actions": [`
`10`	`10`	`{`