Skip to content
This repository was archived by the owner on Sep 26, 2023. It is now read-only.

Commit 744ecb9

Browse files
committed Apr 1, 2019
Add basic JSON support
By applying this commit the `anon` tool will be able to read, anonymise and output basic JSON files. It currently only supports one-level JSON fields. Some small refactoring has been done, and it's more than probable that the solution could be a bit DRYer, but I don't have time to do so. Signed-off-by: Albert Pastrana <albert.pastrana@intenthq.com>
1 parent fecd661 commit 744ecb9

12 files changed

+473
-218
lines changed
 

‎README.md

+46-11
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
</a> [![Go Report Card](https://goreportcard.com/badge/github.com/intenthq/anon)](https://goreportcard.com/report/github.com/intenthq/anon) [![License](https://img.shields.io/npm/l/express.svg)](https://github.com/intenthq/anon/LICENSE)
99
![GitHub release](https://img.shields.io/github/release/intenthq/anon.svg)
1010

11-
Anon is a tool for taking delimited files and anonymising or transforming columns until the output is useful for applications where sensitive information cannot be exposed.
11+
Anon is a tool for taking delimited files and anonymising or transforming columns/fields until the output is useful for applications where sensitive information cannot be exposed. Currently this tool supports both CSV and JSON files (with one level of depth).
1212

1313
## Installation
1414

@@ -24,17 +24,20 @@ anon [--config <path to config file, default is ./config.json>]
2424
Anon is designed to take input from `STDIN` and by default will output the anonymised file to `STDOUT`:
2525

2626
```sh
27-
anon < some_file.csv > some_file_anonymised.csv
27+
anon < some_file > some_file_anonymised
2828
```
2929

3030
### Configuration
3131

32-
In order to be useful, Anon needs to be told what you want to do to each column of the CSV. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
32+
In order to be useful, Anon needs to be told what you want to do to each column/field of the input. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
3333

3434
```json5
3535
{
36-
"csv": {
37-
"delimiter": ","
36+
// Name of the format of the input file
37+
// Currently supports "csv" and "json"
38+
"formatName": {
39+
// Options for the format you have picked go here.
40+
// See the documentation for the format you choose below.
3841
},
3942
// Optionally define a number of rows to randomly sample down to.
4043
// To do it, it will hash (using FNV-1 32 bits) the column with the ID
@@ -44,15 +47,14 @@ In order to be useful, Anon needs to be told what you want to do to each column
4447
// Number used to mod the hash of the id and determine if the row
4548
// has to be included in the sample or not
4649
"mod": 30000
47-
// Specify in which a column a unique ID exists on which the sampling can
48-
// be performed. Indices are 0 based, so this would sample on the first
49-
// column.
50-
"idColumn": 0
5150
},
5251
// An array of actions to take on each column - indices are 0 based, so index
5352
// 0 in this array corresponds to column 1, and so on.
5453
//
55-
// There must be an action for every column in the CSV.
54+
// If anonymising a CSV, there must be an action for every column in it.
55+
// If anonymising a JSON, there must be an action for each field that needs to
56+
// be anonymised. If there is no action defined for a specific field, this
57+
// field value will be left untouched.
5658
"actions": [
5759
{
5860
// The no-op, leaves the input unchanged.
@@ -61,7 +63,10 @@ In order to be useful, Anon needs to be told what you want to do to each column
6163
{
6264
// Takes a UK format postcode (eg. W1W 8BE) and just keeps the outcode
6365
// (eg. W1W).
64-
"name": "outcode"
66+
"name": "outcode",
67+
// The field in the JSON to which this action is applied. If a field in
68+
// the json doesn't have an action defined, then it will be left untouched.
69+
"jsonField": "postcode"
6570
},
6671
{
6772
// Hash (SHA1) the input.
@@ -100,6 +105,36 @@ In order to be useful, Anon needs to be told what you want to do to each column
100105
}
101106
```
102107

108+
## Formats
109+
110+
You can use CSV or JSON files as input.
111+
112+
### CSV
113+
114+
For a CSV file you will need a config like this:
115+
116+
```json5
117+
"csv": {
118+
"delimiter": ",",
119+
// Specify in which column a unique ID exists on which the sampling can
120+
// be performed. Indices are 0 based, so this would sample on the first
121+
// column.
122+
"idColumn": 0
123+
}
124+
```
125+
126+
### JSON
127+
128+
For a JSON file you will need to define a config like this:
129+
130+
```json5
131+
"json": {
132+
// Specify in which field a unique ID exists on which the sampling can
133+
// be performed.
134+
"idField": "id"
135+
}
136+
```
137+
103138
## Contributing
104139

105140
Any contribution will be welcome, please refer to our [contributing guidelines](CONTRIBUTING.md) for more information.

‎anonymisations.go

+16
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type RangeConfig struct {
3232
type ActionConfig struct {
3333
Name string
3434
Salt *string
35+
JsonField *string
3536
DateConfig DateConfig
3637
RangeConfig []RangeConfig
3738
}
@@ -48,6 +49,21 @@ func anonymisations(configs *[]ActionConfig) ([]Anonymisation, error) {
4849
return res, nil
4950
}
5051

52+
// Returns a map of anonymisations according to the config, indexed by JsonField
53+
func anonymisationsMap(configs *[]ActionConfig) (map[string]Anonymisation, error) {
54+
var err error
55+
res := make(map[string]Anonymisation)
56+
for _, config := range *configs {
57+
if config.JsonField == nil {
58+
return nil, errors.New("You need to define a JsonField for each action configured.")
59+
}
60+
if res[*config.JsonField], err = config.create(); err != nil {
61+
return nil, err
62+
}
63+
}
64+
return res, nil
65+
}
66+
5167
// Returns the configured salt or a random one
5268
// if it's not set.
5369
func (ac *ActionConfig) saltOrRandom() string {

‎anonymisations_test.go

+59-8
Original file line numberDiff line numberDiff line change
@@ -32,23 +32,74 @@ func assertAnonymisationFunction(t *testing.T, expected Anonymisation, actual An
3232

3333
func TestAnonymisations(t *testing.T) {
3434
t.Run("a valid configuration", func(t *testing.T) {
35+
f1, f2 := "f1", "f2"
3536
conf := &[]ActionConfig{
3637
ActionConfig{
37-
Name: "nothing",
38+
Name: "nothing",
39+
JsonField: &f1,
3840
},
3941
ActionConfig{
40-
Name: "hash",
41-
Salt: &salt,
42+
Name: "hash",
43+
Salt: &salt,
44+
JsonField: &f2,
4245
},
4346
}
44-
anons, err := anonymisations(conf)
45-
assert.NoError(t, err)
46-
assertAnonymisationFunction(t, identity, anons[0], "a")
47-
assertAnonymisationFunction(t, hash(salt), anons[1], "a")
47+
t.Run("anonymisations should return an array with each anonymisation created", func(t *testing.T) {
48+
anons, err := anonymisations(conf)
49+
assert.NoError(t, err)
50+
assertAnonymisationFunction(t, identity, anons[0], "a")
51+
assertAnonymisationFunction(t, hash(salt), anons[1], "a")
52+
})
53+
t.Run("anonymisationsMap should return a map with each anonymisation created and indexed by field", func(t *testing.T) {
54+
anons, err := anonymisationsMap(conf)
55+
assert.NoError(t, err)
56+
assertAnonymisationFunction(t, identity, anons[f1], "a")
57+
assertAnonymisationFunction(t, hash(salt), anons[f2], "a")
58+
})
4859
})
4960
t.Run("an invalid configuration", func(t *testing.T) {
5061
conf := &[]ActionConfig{ActionConfig{Name: "year", DateConfig: DateConfig{Format: "3333"}}}
51-
anons, err := anonymisations(conf)
62+
t.Run("anonymisations should return an error", func(t *testing.T) {
63+
anons, err := anonymisations(conf)
64+
assert.Error(t, err, "should return an error")
65+
assert.Nil(t, anons)
66+
})
67+
t.Run("anonymisationsMap should return an error", func(t *testing.T) {
68+
anons, err := anonymisationsMap(conf)
69+
assert.Error(t, err, "should return an error")
70+
assert.Nil(t, anons)
71+
})
72+
})
73+
}
74+
75+
func TestAnonymisationsMap(t *testing.T) {
76+
var f1, f2 = "f1", "f2"
77+
t.Run("a valid configuration", func(t *testing.T) {
78+
conf := &[]ActionConfig{
79+
ActionConfig{
80+
JsonField: &f1,
81+
Name: "nothing",
82+
},
83+
ActionConfig{
84+
JsonField: &f2,
85+
Name: "hash",
86+
Salt: &salt,
87+
},
88+
}
89+
anons, err := anonymisationsMap(conf)
90+
assert.NoError(t, err)
91+
assertAnonymisationFunction(t, identity, anons[f1], "a")
92+
assertAnonymisationFunction(t, hash(salt), anons[f2], "a")
93+
})
94+
t.Run("an action configuration without JsonField defined", func(t *testing.T) {
95+
conf := &[]ActionConfig{ActionConfig{Name: "year"}}
96+
anons, err := anonymisationsMap(conf)
97+
assert.Error(t, err, "should return an error")
98+
assert.Nil(t, anons)
99+
})
100+
t.Run("an invalid action configuration", func(t *testing.T) {
101+
conf := &[]ActionConfig{ActionConfig{JsonField: &f1, Name: "year", DateConfig: DateConfig{Format: "3333"}}}
102+
anons, err := anonymisationsMap(conf)
52103
assert.Error(t, err, "should return an error")
53104
assert.Nil(t, anons)
54105
})

‎config.go

+10-10
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,29 @@ import (
88
// CsvConfig stores the config to read and write the csv file
99
type CsvConfig struct {
1010
Delimiter string
11+
IDColumn uint32
12+
}
13+
14+
// JsonConfig stores the config to read and write the json file
15+
type JsonConfig struct {
16+
IDField string
1117
}
1218

1319
// SamplingConfig stores the config to know how to sample the file
1420
type SamplingConfig struct {
15-
Mod uint32
16-
IDColumn uint32
21+
Mod uint32
1722
}
1823

1924
// Config stores all the configuration
2025
type Config struct {
21-
Csv CsvConfig
26+
Csv *CsvConfig
27+
Json *JsonConfig
2228
Sampling SamplingConfig
2329
Actions []ActionConfig
2430
}
2531

26-
var defaultCsvConfig = CsvConfig{
27-
Delimiter: ",",
28-
}
29-
3032
var defaultSamplingConfig = SamplingConfig{
31-
Mod: 1,
32-
IDColumn: 0,
33+
Mod: 1,
3334
}
3435

3536
var defaultActionsConfig = []ActionConfig{}
@@ -42,7 +43,6 @@ func loadConfig(filename string) (*Config, error) {
4243
}
4344
decoder := json.NewDecoder(file)
4445
conf := Config{
45-
Csv: defaultCsvConfig,
4646
Sampling: defaultSamplingConfig,
4747
Actions: defaultActionsConfig,
4848
}

‎config_test.go

+4-8
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,8 @@ func TestLoadConfig(t *testing.T) {
2222
conf, err := loadConfig("config_defaults_test.json")
2323
require.NoError(t, err, "should return no error if the config can be loaded")
2424
assert.Equal(t, Config{
25-
Csv: CsvConfig{
26-
Delimiter: ",",
27-
},
2825
Sampling: SamplingConfig{
29-
Mod: 1,
30-
IDColumn: 0,
26+
Mod: 1,
3127
},
3228
Actions: []ActionConfig{},
3329
}, *conf, "should fill the config with the default values")
@@ -39,12 +35,12 @@ func TestLoadConfig(t *testing.T) {
3935
conf, err := loadConfig("config_test.json")
4036
require.NoError(t, err, "should return no error if the config can be loaded")
4137
assert.Equal(t, Config{
42-
Csv: CsvConfig{
38+
Csv: &CsvConfig{
4339
Delimiter: "|",
40+
IDColumn: 84,
4441
},
4542
Sampling: SamplingConfig{
46-
Mod: 77,
47-
IDColumn: 84,
43+
Mod: 77,
4844
},
4945
Actions: []ActionConfig{
5046
ActionConfig{

‎config_test.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"csv": {
3-
"delimiter": "|"
3+
"delimiter": "|",
4+
"idColumn": 84
45
},
56
"sampling": {
6-
"mod": 77,
7-
"idColumn": 84
7+
"mod": 77
88
},
99
"actions": [
1010
{

‎csv_processor.go

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
package main
2+
3+
import (
4+
"encoding/csv"
5+
"fmt"
6+
"io"
7+
"log"
8+
"os"
9+
)
10+
11+
func processCsv(inputFile string, outputFile string, conf *Config) error {
12+
r := initCsvReader(inputFile, conf.Csv)
13+
w := initCsvWriter(outputFile, conf.Csv)
14+
15+
anons, err := anonymisations(&conf.Actions)
16+
if err != nil {
17+
return err
18+
}
19+
20+
if err := anonymiseCsv(r, w, conf, &anons); err != nil {
21+
return err
22+
}
23+
24+
return nil
25+
}
26+
27+
func initCsvReader(filename string, conf *CsvConfig) *csv.Reader {
28+
reader := csv.NewReader(fileOr(filename, os.Stdin, os.Open))
29+
reader.Comma = []rune(conf.Delimiter)[0]
30+
return reader
31+
}
32+
33+
func initCsvWriter(filename string, conf *CsvConfig) *csv.Writer {
34+
writer := csv.NewWriter(fileOr(filename, os.Stdout, os.Create))
35+
writer.Comma = []rune(conf.Delimiter)[0]
36+
return writer
37+
}
38+
39+
func anonymiseCsv(r *csv.Reader, w *csv.Writer, conf *Config, anons *[]Anonymisation) error {
40+
i := 0
41+
42+
for {
43+
record, err := r.Read()
44+
if err == io.EOF {
45+
break
46+
} else if pe, ok := err.(*csv.ParseError); ok && pe.Err == csv.ErrFieldCount {
47+
// we just print the error and skip the record
48+
log.Print(err)
49+
} else if err != nil {
50+
return err
51+
} else if int64(conf.Csv.IDColumn) >= int64(len(record)) {
52+
return fmt.Errorf("id column (%d) out of range, record has %d columns", conf.Csv.IDColumn, len(record))
53+
} else if sample(record[conf.Csv.IDColumn], conf.Sampling) {
54+
anonymised, err := anonymise(record, *anons)
55+
if err != nil {
56+
// we just print the error and skip the record
57+
log.Print(err)
58+
} else {
59+
w.Write(anonymised)
60+
}
61+
//TODO decide how often do we want to flush
62+
if i%100 == 0 {
63+
w.Flush()
64+
}
65+
}
66+
i++
67+
}
68+
w.Flush()
69+
return nil
70+
}

0 commit comments

Comments
 (0)
Failed to load comments.