Skip to content

Commit

Permalink
feat: Optional Pandas support, export to CSV/XLSX
Browse files Browse the repository at this point in the history
  • Loading branch information
tkarabela committed Nov 17, 2023
1 parent b7295e7 commit 10468d0
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 12 deletions.
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@ _okane_ is a pure Python parser for bank statements in camt.053 XML format [[1]]
used by the Czech Banking Association (ČBA) [[2]].

It parses `BkToCstmrStmt` XML element into `okane.BankToCustomerStatement` which is
a Pydantic model. It can also work as a CLI tool, converting camt.053 XML to JSON.
a Pydantic model. It can also work as a CLI tool, converting camt.053 XML files to JSON, CSV or XLSX.

## Installation

```shell
pip install okane

# or, if you'd like to use the CSV/XLSX export features and access the data as a `pd.DataFrame`
pip install okane[pandas]
```

## Example
Expand All @@ -38,6 +41,7 @@ pip install okane
BankId(bic='REVOLT21', id=None)
>>> statement.transactions[3].ref
TransactionRef(message_id='XXX', end_to_end_id='XXX', account_servicer_ref=None, payment_invocation_id=None, instruction_id=None, mandate_id=None, cheque_number=None, clearing_system_ref=None)
>>> df = statement.as_dataframe()

### Command-line interface

Expand All @@ -57,6 +61,10 @@ head ./tests/data/test2.xml
```

```shell
# okane ./tests/data/test*.xml -f json --no-indent -o output.jsonl
# okane ./tests/data/test*.xml -f csv -o output.csv
# okane ./tests/data/test*.xml -f xlsx -o output.xlsx

okane ./tests/data/test2.xml
```

Expand Down
90 changes: 80 additions & 10 deletions okane.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,19 @@

import argparse
import sys
from typing import Optional
from typing import Optional, Any
from lxml import etree
from lxml.etree import _Element
from io import BytesIO
from io import BytesIO, StringIO
from pydantic import BaseModel
from enum import Enum
import datetime
from decimal import Decimal
import warnings
try:
import pandas as pd
except ImportError:
pd = None # type: ignore[assignment]


__version__ = "0.1.0"
Expand Down Expand Up @@ -231,6 +235,16 @@ def from_file(cls, path: str) -> "BankToCustomerStatement":

return parse_statement(root)

def as_dataframe(self) -> "pd.DataFrame":
    """
    Return the transactions of this statement as a pandas DataFrame.

    Each element of ``self.transactions`` contributes one record: its
    ``model_dump()`` output flattened by ``flatten_dict`` with the
    ``"transaction."`` key prefix. Two columns are then set on the frame:
    ``statement.id`` (from ``self.statement_id``) and
    ``statement.account_id`` (``str(self.account_id)``).

    Raises:
        RuntimeError: if pandas could not be imported.
    """
    if pd is None:
        raise RuntimeError("pandas is not installed")

    records = []
    for transaction in self.transactions:
        records.append(flatten_dict(transaction.model_dump(), prefix="transaction."))

    frame = pd.DataFrame.from_records(records)
    # Scalar assignment: the same statement-level value is set on every row.
    frame["statement.id"] = self.statement_id
    frame["statement.account_id"] = str(self.account_id)
    return frame


def parse_statement(root: _Element) -> BankToCustomerStatement:
stmt = get_element(root, "BkToCstmrStmt/Stmt")
Expand Down Expand Up @@ -335,20 +349,76 @@ def parse_date_isoformat(s: str) -> datetime.date:
return datetime.date.fromisoformat(s[:10])


def flatten_dict(d: dict[str, Any], prefix: str = "") -> dict[str, Any]:
    """
    Collapse a nested dict into a single-level dict with dot-joined keys.

    Args:
        d: mapping to flatten; values that are themselves dicts are
            flattened recursively, every other value is copied as-is.
        prefix: string prepended to every produced key.

    Returns:
        A new dict in which a value found at ``d["a"]["b"]`` is stored
        under the key ``f"{prefix}a.b"``.
    """
    flat: dict[str, Any] = {}
    for key, value in d.items():
        full_key = f"{prefix}{key}"
        if not isinstance(value, dict):
            flat[full_key] = value
            continue
        # Nested mapping: recurse, extending the prefix with this key and a dot.
        flat.update(flatten_dict(value, prefix=f"{full_key}."))
    return flat


class OutputFormat(str, Enum):
    """Output formats understood by the command-line interface.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value.
    """

    # Pydantic JSON dump of each statement.
    JSON = "json"
    # Transactions table written with pandas ``to_csv``.
    CSV = "csv"
    # Transactions table written with pandas ``to_excel``.
    XLSX = "xlsx"


def main(argv: list[str]) -> int:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("input_files", nargs="+", metavar="statement.xml")
parser.add_argument("input_files", nargs="+", metavar="statement.xml",
help="path to input camt.053 XML file(s)")
parser.add_argument("--version", "-V", action="version", version=__version__)
parser.add_argument("--output", "-o", metavar="FILE", default="-", help="path to output file "
"(default: write to stdout)")
parser.add_argument("--format", "-f", choices=[fmt.value for fmt in OutputFormat],
type=OutputFormat, default=OutputFormat.JSON, help="set output format (default: json)")
parser.add_argument("--no-indent", action="store_true", help="do not indent JSON output files")

args = parser.parse_args(argv)
input_files = args.input_files

if len(input_files) == 1:
statement = BankToCustomerStatement.from_file(input_files[0])
print(statement.model_dump_json(indent=4))
output_path = args.output
output_format = args.format
no_indent = args.no_indent

statements = [BankToCustomerStatement.from_file(path) for path in input_files]

output_bytes = b""

match output_format:
case OutputFormat.JSON:
for statement in statements:
output_bytes += statement.model_dump_json(indent=None if no_indent else 4).encode("utf-8")
output_bytes += b"\n"
case OutputFormat.CSV:
dfs = []
for statement in statements:
df = statement.as_dataframe()
dfs.append(df)
assert pd is not None
all_df = pd.concat(dfs)
buf = StringIO()
all_df.to_csv(buf, index=False)
output_bytes = buf.getvalue().encode("utf-8")
case OutputFormat.XLSX:
dfs = []
for statement in statements:
df = statement.as_dataframe()
dfs.append(df)
assert pd is not None
all_df = pd.concat(dfs)
buf_bin = BytesIO()
all_df.to_excel(buf_bin, index=False)
output_bytes = buf_bin.getvalue()
case _:
raise NotImplementedError(f"Unsupported output format {output_format}")

if output_path == "-":
sys.stdout.buffer.write(output_bytes)
else:
for path in input_files:
statement = BankToCustomerStatement.from_file(path)
print(statement.model_dump_json())
with open(output_path, "wb") as fp:
fp.write(output_bytes)

return 0

Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,18 @@ include = [
python = "^3.10"
lxml = "^4.9"
pydantic = "^2.5"
pandas = { version = "^2.1", optional = true }
openpyxl = { version = "^3.1", optional = true }

[tool.poetry.extras]
pandas = ["pandas", "openpyxl"]

[tool.poetry.group.dev.dependencies]
mypy = "^1.1"
pytest = "^7.4"
pytest-cov = "^4.0"
lxml-stubs = "^0.4"
pandas-stubs = "^2.1"

[tool.poetry.scripts]
okane = "okane:main"
Expand Down
56 changes: 55 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import os.path as op
import json
from io import StringIO, BytesIO
import pytest
import okane
try:
import pandas as pd
except Exception:
pd = None


def test_cli_to_json(capsys):
Expand All @@ -18,7 +24,7 @@ def test_cli_to_json_multiple(capsys):
path1 = op.join(op.dirname(__file__), "./data/test1.xml")
path2 = op.join(op.dirname(__file__), "./data/test2.xml")

assert 0 == okane.main([path1, path2])
assert 0 == okane.main([path1, path2, "--no-indent"])
output = capsys.readouterr().out
output_lines = output.splitlines()
output_dict1 = json.loads(output_lines[0])
Expand All @@ -28,6 +34,54 @@ def test_cli_to_json_multiple(capsys):
assert output_dict2 == TEST2_REFERENCE_DICT


@pytest.mark.skipif(pd is None, reason="requires pandas")
def test_cli_to_csv_multiple(capsys):
    """CSV written to stdout for two inputs matches the ``as_dataframe()`` output."""
    here = op.dirname(__file__)
    paths = [op.join(here, "./data/test1.xml"), op.join(here, "./data/test2.xml")]

    statements = [okane.BankToCustomerStatement.from_file(p) for p in paths]
    expected = pd.concat([s.as_dataframe() for s in statements])

    exit_code = okane.main([*paths, "-f", "csv"])
    assert 0 == exit_code

    captured = capsys.readouterr().out
    actual = pd.read_csv(StringIO(captured))

    # Replace nulls with None, reset the index and stringify every cell
    # before comparing the two frames column by column.
    actual_str = actual.where(pd.notnull(actual), None).reset_index().map(str)
    expected_str = expected.where(pd.notnull(expected), None).reset_index().map(str)

    for column in ("statement.id", "transaction.entry_ref", "transaction.val_date"):
        assert (actual_str[column] == expected_str[column]).all()
    # TODO compare more thoroughly


@pytest.mark.skipif(pd is None, reason="requires pandas")
def test_cli_to_excel_multiple(capsysbinary):
    """XLSX written to stdout for two inputs matches the ``as_dataframe()`` output."""
    here = op.dirname(__file__)
    paths = [op.join(here, "./data/test1.xml"), op.join(here, "./data/test2.xml")]

    statements = [okane.BankToCustomerStatement.from_file(p) for p in paths]
    expected = pd.concat([s.as_dataframe() for s in statements])

    exit_code = okane.main([*paths, "-f", "xlsx"])
    assert 0 == exit_code

    # Binary capture: the workbook is written to stdout as raw bytes.
    captured = capsysbinary.readouterr().out
    actual = pd.read_excel(BytesIO(captured))

    # Replace nulls with None, reset the index and stringify every cell
    # before comparing the two frames column by column.
    actual_str = actual.where(pd.notnull(actual), None).reset_index().map(str)
    expected_str = expected.where(pd.notnull(expected), None).reset_index().map(str)

    for column in ("statement.id", "transaction.entry_ref"):
        assert (actual_str[column] == expected_str[column]).all()
    # TODO compare more thoroughly


TEST1_REFERENCE_DICT = {'account_id': {'iban': 'XXX-IBAN', 'id': None},
'closing_balance': {'amount': '2000.00',
'currency': 'CZK',
Expand Down

0 comments on commit 10468d0

Please sign in to comment.