Skip to content

Commit 328718f

Browse files
committed
Added support for JSON inputs, closes simonw#12
1 parent 5a598ca commit 328718f

File tree

5 files changed

+80
-10
lines changed

5 files changed

+80
-10
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
[![Tests](https://github.com/simonw/csv-diff/workflows/Test/badge.svg)](https://github.com/simonw/csv-diff/actions?query=workflow%3ATest)
66
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/simonw/csv-diff/blob/main/LICENSE)
77

8-
Tool for viewing the difference between two CSV files. See [Generating a commit log for San Francisco’s official list of trees](https://simonwillison.net/2019/Mar/13/tree-history/) (and the [sf-tree-history repo commit log](https://github.com/simonw/sf-tree-history/commits)) for background information on this project.
8+
Tool for viewing the difference between two CSV, TSV or JSON files. See [Generating a commit log for San Francisco’s official list of trees](https://simonwillison.net/2019/Mar/13/tree-history/) (and the [sf-tree-history repo commit log](https://github.com/simonw/sf-tree-history/commits)) for background information on this project.
99

1010
## Installation
1111

@@ -53,6 +53,8 @@ The `--key=id` option means that the `id` column should be treated as the unique
5353

5454
The tool will automatically detect if your files are comma- or tab-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.
5555

56+
You can also feed it JSON files, provided each file contains a JSON array of objects that all share the same keys. JSON input is not auto-detected, so pass `--format=json` when your input files are JSON.
57+
5658
Use `--show-unchanged` to include full details of the unchanged values for rows with at least one change in the diff output:
5759

5860
% csv-diff one.csv two.csv --key=id --show-unchanged

csv_diff/__init__.py

+20
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,26 @@ def load_csv(fp, key=None, dialect=None):
2626
return {keyfn(r): r for r in rows}
2727

2828

29+
def load_json(fp, key=None):
    """Load a JSON array of objects from *fp* into a dict of rows.

    Args:
        fp: Readable file-like object containing a JSON document.
        key: Name of the property that identifies each row. When falsy,
            each row is keyed by the SHA-1 hex digest of its JSON
            serialization with sorted keys.

    Returns:
        A dict mapping each row's key to the row as returned by
        ``_simplify_json_row`` (list/dict values become JSON strings).
        Rows that produce the same key overwrite earlier ones.

    Raises:
        ValueError: If the top-level JSON value is not an array. Malformed
            JSON raises ``json.JSONDecodeError``, itself a ``ValueError``.
        KeyError: If *key* is given and a row does not contain it.
    """
    raw_list = json.load(fp)
    # Validate explicitly rather than with ``assert``: assertions are removed
    # when Python runs with -O, which would let bad input through silently.
    if not isinstance(raw_list, list):
        raise ValueError(
            "JSON input must be an array of objects, got {}".format(
                type(raw_list).__name__
            )
        )
    if key:
        keyfn = lambda r: r[key]
    else:
        # No explicit key: derive a stable identifier from the row's content.
        keyfn = lambda r: hashlib.sha1(
            json.dumps(r, sort_keys=True).encode("utf8")
        ).hexdigest()
    # The key is computed from the raw row before its values are simplified.
    return {keyfn(r): _simplify_json_row(r) for r in raw_list}
39+
40+
41+
def _simplify_json_row(r):
    """Return a copy of row *r* with container values serialized to JSON.

    Values that are a dict, tuple or list are replaced by the string
    produced by ``json.dumps``; every other value is passed through as-is.
    Key order is preserved.

    The input row is left untouched: a new dict is built instead of
    rewriting *r* in place, so callers that keep a reference to the raw row
    do not see it change underneath them.

    Args:
        r: A mapping of column name to value.

    Returns:
        A new dict with the same keys as *r*.
    """
    return {
        key: json.dumps(value) if isinstance(value, (dict, tuple, list)) else value
        for key, value in r.items()
    }
47+
48+
2949
def compare(previous, current, show_unchanged=False):
3050
result = {
3151
"added": [],

csv_diff/cli.py

+10-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import click
22
import json as std_json
3-
from . import load_csv, compare, human_text
3+
from . import load_csv, load_json, compare, human_text
44

55

66
@click.command()
@@ -18,9 +18,9 @@
1818
)
1919
@click.option(
2020
"--format",
21-
type=click.Choice(["csv", "tsv"]),
21+
type=click.Choice(["csv", "tsv", "json"]),
2222
default=None,
23-
help="Explicitly specify input format (csv, tsv) instead of auto-detecting",
23+
help="Explicitly specify input format (csv, tsv, json) instead of auto-detecting",
2424
)
2525
@click.option(
2626
"--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
@@ -43,16 +43,19 @@
4343
help="Show unchanged fields for rows with at least one change",
4444
)
4545
def cli(previous, current, key, format, json, singular, plural, show_unchanged):
46-
"Diff two CSV files"
46+
"Diff two CSV or JSON files"
4747
dialect = {
4848
"csv": "excel",
4949
"tsv": "excel-tab",
5050
}
5151

5252
def load(filename):
    # Read one input file into a {key: row} dict using the loader selected
    # by --format. Both loaders build their result before returning, so the
    # file can be closed as soon as they finish; ``with`` guarantees the
    # handle is released even when parsing raises.
    if format == "json":
        with open(filename) as fp:
            return load_json(fp, key=key)
    # newline="" lets the csv module do its own newline handling.
    with open(filename, newline="") as fp:
        # dialect is None when --format is omitted.
        return load_csv(fp, key=key, dialect=dialect.get(format))
5659

5760
diff = compare(load(previous), load(current), show_unchanged)
5861
if json:

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def get_long_description():
1515

1616
setup(
1717
name="csv-diff",
18-
description="Python CLI tool and library for diffing CSV files",
18+
description="Python CLI tool and library for diffing CSV and JSON files",
1919
long_description=get_long_description(),
2020
long_description_content_type="text/markdown",
2121
author="Simon Willison",

tests/test_cli.py

+46-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from click.testing import CliRunner
2-
from csv_diff import cli
2+
from csv_diff import cli, load_csv
3+
import csv
34
import pytest
45
from .test_csv_diff import ONE, ONE_TSV, TWO, TWO_TSV, THREE, FIVE
6+
import io
57
import json
68
from textwrap import dedent
79

@@ -15,6 +17,29 @@ def tsv_files(tmpdir):
1517
return str(one), str(two)
1618

1719

20+
@pytest.fixture
def json_files(tmpdir):
    """Write two JSON documents into tmpdir and return their paths as strings.

    The second file differs from the first in row 1's nested object (an
    extra "bar" property) and in row 2's name.
    """
    before_rows = [
        {"id": 1, "name": "Cleo", "nested": {"foo": 3}},
        {"id": 2, "name": "Pancakes", "nested": {"foo": 3}},
    ]
    after_rows = [
        {"id": 1, "name": "Cleo", "nested": {"foo": 3, "bar": 5}},
        {"id": 2, "name": "Pancakes!", "nested": {"foo": 3}},
    ]
    one = tmpdir / "one.json"
    one.write(json.dumps(before_rows))
    two = tmpdir / "two.json"
    two.write(json.dumps(after_rows))
    return str(one), str(two)
41+
42+
1843
def test_human_cli(tmpdir):
1944
one = tmpdir / "one.csv"
2045
one.write(ONE)
@@ -101,6 +126,26 @@ def test_tsv_files(tsv_files):
101126
} == json.loads(result.output.strip())
102127

103128

129+
def test_json_files(json_files):
    """Diffing the two JSON fixtures with --format json reports both changes."""
    one, two = json_files
    cli_args = [one, two, "--key", "id", "--json", "--format", "json"]
    expected = {
        "added": [],
        "removed": [],
        "changed": [
            # Nested objects are compared as their JSON-serialized strings
            {"key": 1, "changes": {"nested": ['{"foo": 3}', '{"foo": 3, "bar": 5}']}},
            {"key": 2, "changes": {"name": ["Pancakes", "Pancakes!"]}},
        ],
        "columns_added": [],
        "columns_removed": [],
    }
    result = CliRunner().invoke(cli.cli, cli_args, catch_exceptions=False)
    assert 0 == result.exit_code
    assert expected == json.loads(result.output.strip())
147+
148+
104149
def test_sniff_format(tsv_files):
105150
one, two = tsv_files
106151
result = CliRunner().invoke(cli.cli, [one, two, "--key", "id", "--json"])

0 commit comments

Comments
 (0)