From 8bd36dcf23a5e8bdcc5f7c0e8470e310cd0db054 Mon Sep 17 00:00:00 2001
From: dr-duplo <monsieur.cm@gmx.de>
Date: Mon, 29 Jan 2018 20:59:42 +0100
Subject: [PATCH 1/5] Identify wrongly interpreted tag ids

The change enables the transaction parser to identify
tag ids which seem to be valid but are artefacts of line wraps
in the content of another tag. This occurs if the content contains
a character sequence like ...13:12:43..., which happens to be a
timestamp but looks like a tag id when unfortunately wrapped like:
13\r\n
:12:43  --> invalid tag id :12:
---
 mt940/models.py | 61 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/mt940/models.py b/mt940/models.py
index 3f158d4..43f8f56 100644
--- a/mt940/models.py
+++ b/mt940/models.py
@@ -315,6 +315,48 @@ def strip(cls, lines):
             if line:
                 yield line
 
+    @classmethod
+    def normalize_tag_id(cls, tag_id):
+        # Since non-digit tags exist, make the conversion optional
+        if tag_id.isdigit():
+            tag_id = int(tag_id)
+
+        return tag_id
+
+    @classmethod
+    def sanatize_tag_id_matches(cls, tag_matches):
+        i_next = 0
+        for i, match in enumerate(tag_matches):
+            # tag match was rejected
+            if i < i_next:
+                continue
+
+            # next match would be
+            i_next = i + 1
+
+            # normalize tag id
+            tag_id = cls.normalize_tag_id(match.group('tag'))
+
+            # tag should be known
+            assert tag_id in mt940.tags.TAG_BY_ID, 'Unknown tag %r ' \
+                'in line: %r' % (tag_id, match.group(0))
+
+            # special treatnment for long tag content with possible
+            # bad line wrap which produces tag_id like line beginnings
+            # seen in :86: tag
+            if tag_id == mt940.tags.Tags.TRANSACTION_DETAILS.value.id:
+                # search subsequent tags for unknown tag ids
+                # these lines likely belong to the previous tag
+                for j in range(i+1, len(tag_matches)):
+                    next_tag_id = cls.normalize_tag_id(tag_matches[j].group('tag'))
+                    if next_tag_id in mt940.tags.TAG_BY_ID:
+                        # extend range to found valid tag
+                        i_next = j
+                        break;
+
+            # a valid tag match
+            yield { 'tag_id': tag_id , 'start': i, 'end': i_next - 1 }                
+                
     def parse(self, data):
         '''Parses mt940 data, expects a string with data
 
@@ -334,16 +376,15 @@ def parse(self, data):
             re.MULTILINE)
         matches = list(tag_re.finditer(data))
 
-        for i, match in enumerate(matches):
-            tag_id = match.group('tag')
-            # Since non-digit tags exist, make the conversion optional
+        # identify valid matches
+        valid_match_ranges = list(self.sanatize_tag_id_matches(matches))
 
-            if tag_id.isdigit():
-                tag_id = int(tag_id)
-
-            assert tag_id in mt940.tags.TAG_BY_ID, 'Unknown tag %r ' \
-                'in line: %r' % (tag_id, match.group(0))
+        for i, tr in enumerate(valid_match_ranges):
+            match = matches[tr['start']]
+            tag_id = tr['tag_id'];
 
+            # get tag instance corresponding to tag id
+            
             tag = mt940.tags.TAG_BY_ID.get(match.group('full_tag')) \
                 or mt940.tags.TAG_BY_ID[tag_id]
 
@@ -351,8 +392,8 @@ def parse(self, data):
             # regex matches have a `end()` and `start()` to indicate the start
             # and end index of the match.
 
-            if matches[i + 1:]:
-                tag_data = data[match.end():matches[i + 1].start()].strip()
+            if valid_match_ranges[i + 1:]:
+                tag_data = data[match.end():matches[valid_match_ranges[i + 1]['start']].start()].strip()
             else:
                 tag_data = data[match.end():].strip()
 

From 127b978a1dd6f54d36d70b3f6e38fd388aab4b56 Mon Sep 17 00:00:00 2001
From: dr-duplo <monsieur.cm@gmx.de>
Date: Mon, 29 Jan 2018 21:15:54 +0100
Subject: [PATCH 2/5] Simplify tag validation

---
 mt940/models.py | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/mt940/models.py b/mt940/models.py
index 43f8f56..f3c47d1 100644
--- a/mt940/models.py
+++ b/mt940/models.py
@@ -324,10 +324,10 @@ def normalize_tag_id(cls, tag_id):
         return tag_id
 
     @classmethod
-    def sanatize_tag_id_matches(cls, tag_matches):
+    def sanatize_tag_id_matches(cls, matches):
         i_next = 0
-        for i, match in enumerate(tag_matches):
-            # tag match was rejected
+        for i, match in enumerate(matches):
+            # match was rejected
             if i < i_next:
                 continue
 
@@ -341,21 +341,22 @@ def sanatize_tag_id_matches(cls, tag_matches):
             assert tag_id in mt940.tags.TAG_BY_ID, 'Unknown tag %r ' \
                 'in line: %r' % (tag_id, match.group(0))
 
-            # special treatnment for long tag content with possible
+            # special treatment for long tag content with possible
             # bad line wrap which produces tag_id like line beginnings
-            # seen in :86: tag
+            # seen with :86: tag
             if tag_id == mt940.tags.Tags.TRANSACTION_DETAILS.value.id:
                 # search subsequent tags for unknown tag ids
                 # these lines likely belong to the previous tag
-                for j in range(i+1, len(tag_matches)):
-                    next_tag_id = cls.normalize_tag_id(tag_matches[j].group('tag'))
+                for j in range(i_next, len(matches)):
+                    next_tag_id = cls.normalize_tag_id(matches[j].group('tag'))
                     if next_tag_id in mt940.tags.TAG_BY_ID:
-                        # extend range to found valid tag
+                        # this one is the next valid match
                         i_next = j
                         break;
-
-            # a valid tag match
-            yield { 'tag_id': tag_id , 'start': i, 'end': i_next - 1 }                
+                    # else reject match
+                    
+            # a valid match
+            yield match               
                 
     def parse(self, data):
         '''Parses mt940 data, expects a string with data
@@ -377,14 +378,12 @@ def parse(self, data):
         matches = list(tag_re.finditer(data))
 
         # identify valid matches
-        valid_match_ranges = list(self.sanatize_tag_id_matches(matches))
+        valid_matches = list(self.sanatize_tag_id_matches(matches))
 
-        for i, tr in enumerate(valid_match_ranges):
-            match = matches[tr['start']]
-            tag_id = tr['tag_id'];
+        for i, match in enumerate(valid_matches):
+            tag_id = self.normalize_tag_id(match.group('tag'))
 
             # get tag instance corresponding to tag id
-            
             tag = mt940.tags.TAG_BY_ID.get(match.group('full_tag')) \
                 or mt940.tags.TAG_BY_ID[tag_id]
 
@@ -392,8 +391,8 @@ def parse(self, data):
             # regex matches have a `end()` and `start()` to indicate the start
             # and end index of the match.
 
-            if valid_match_ranges[i + 1:]:
-                tag_data = data[match.end():matches[valid_match_ranges[i + 1]['start']].start()].strip()
+            if valid_matches[i + 1:]:
+                tag_data = data[match.end():valid_matches[i + 1].start()].strip()
             else:
                 tag_data = data[match.end():].strip()
 

From ad0ed6464db7f6f595b6cbc203f8b0a69ace71ab Mon Sep 17 00:00:00 2001
From: dr-duplo <monsieur.cm@gmx.de>
Date: Mon, 29 Jan 2018 21:29:17 +0100
Subject: [PATCH 3/5] Fix build errors

---
 mt940/models.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mt940/models.py b/mt940/models.py
index f3c47d1..b31aa85 100644
--- a/mt940/models.py
+++ b/mt940/models.py
@@ -352,12 +352,12 @@ def sanatize_tag_id_matches(cls, matches):
                     if next_tag_id in mt940.tags.TAG_BY_ID:
                         # this one is the next valid match
                         i_next = j
-                        break;
+                        break
                     # else reject match
-                    
+
             # a valid match
-            yield match               
-                
+            yield match
+
     def parse(self, data):
         '''Parses mt940 data, expects a string with data
 
@@ -392,7 +392,8 @@ def parse(self, data):
             # and end index of the match.
 
             if valid_matches[i + 1:]:
-                tag_data = data[match.end():valid_matches[i + 1].start()].strip()
+                tag_data = 
+                    data[match.end():valid_matches[i + 1].start()].strip()
             else:
                 tag_data = data[match.end():].strip()
 

From d81a7e35741c85ade9bd3bc12390364cff44a128 Mon Sep 17 00:00:00 2001
From: dr-duplo <monsieur.cm@gmx.de>
Date: Mon, 29 Jan 2018 21:33:05 +0100
Subject: [PATCH 4/5] Fix build again

---
 mt940/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mt940/models.py b/mt940/models.py
index b31aa85..0bc3944 100644
--- a/mt940/models.py
+++ b/mt940/models.py
@@ -392,7 +392,7 @@ def parse(self, data):
             # and end index of the match.
 
             if valid_matches[i + 1:]:
-                tag_data = 
+                tag_data = \
                     data[match.end():valid_matches[i + 1].start()].strip()
             else:
                 tag_data = data[match.end():].strip()

From 45f4d4e458ab81082a06a5aee54870cb3f766d4d Mon Sep 17 00:00:00 2001
From: dr-duplo <monsieur.cm@gmx.de>
Date: Mon, 29 Jan 2018 22:13:40 +0100
Subject: [PATCH 5/5] Add test data for transaction details wrap

---
 .../transaction_details_wrapped.sta           | 11 +++
 .../transaction_details_wrapped.yml           | 91 +++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 tests/self-provided/transaction_details_wrapped.sta
 create mode 100644 tests/self-provided/transaction_details_wrapped.yml

diff --git a/tests/self-provided/transaction_details_wrapped.sta b/tests/self-provided/transaction_details_wrapped.sta
new file mode 100644
index 0000000..b450135
--- /dev/null
+++ b/tests/self-provided/transaction_details_wrapped.sta
@@ -0,0 +1,11 @@
+
+:20:STARTUMSE
+:25:12345678/1020304050
+:28C:00000/001
+:60F:C160229EUR1200,00
+:61:1602300301DR6,00N024NONREF
+:86:805?00ENTGELTABSCHLUSS?106666?20Pauschalen?3012345678?1122334
+45566?602017-01-01T13
+:12:11
+:62F:C160301EUR1194,00
+-
diff --git a/tests/self-provided/transaction_details_wrapped.yml b/tests/self-provided/transaction_details_wrapped.yml
new file mode 100644
index 0000000..7564caf
--- /dev/null
+++ b/tests/self-provided/transaction_details_wrapped.yml
@@ -0,0 +1,91 @@
+  &id001 !!python/object:mt940.models.Transactions
+data:
+  account_identification: 12345678/1020304050
+  final_closing_balance: !!python/object:mt940.models.Balance
+    amount: !!python/object:mt940.models.Amount
+      amount: !!python/object/apply:decimal.Decimal ['1194.00']
+      currency: EUR
+    date: !!python/object/apply:mt940.models.Date
+    - !!binary |
+      B+ADAQ==
+    status: C
+  final_opening_balance: !!python/object:mt940.models.Balance
+    amount: !!python/object:mt940.models.Amount
+      amount: !!python/object/apply:decimal.Decimal ['1200.00']
+      currency: EUR
+    date: !!python/object/apply:mt940.models.Date
+    - !!binary |
+      B+ACHQ==
+    status: C
+  sequence_number: '001'
+  statement_number: '00000'
+  transaction_reference: STARTUMSE
+processors:
+  post_account_identification: []
+  post_available_balance: []
+  post_closing_balance: []
+  post_date_time_indication: []
+  post_final_closing_balance: []
+  post_final_opening_balance: []
+  post_floor_limit_indicator: []
+  post_forward_available_balance: []
+  post_intermediate_closing_balance: []
+  post_intermediate_opening_balance: []
+  post_non_swift: []
+  post_opening_balance: []
+  post_related_reference: []
+  post_statement: [!!python/name:mt940.processors.date_cleanup_post_processor '']
+  post_statement_number: []
+  post_sum_credit_entries: []
+  post_sum_debit_entries: []
+  post_transaction_details: [!!python/name:mt940.processors.transaction_details_post_processor '']
+  post_transaction_reference_number: []
+  pre_account_identification: []
+  pre_available_balance: []
+  pre_closing_balance: []
+  pre_date_time_indication: []
+  pre_final_closing_balance: []
+  pre_final_opening_balance: []
+  pre_floor_limit_indicator: []
+  pre_forward_available_balance: []
+  pre_intermediate_closing_balance: []
+  pre_intermediate_opening_balance: []
+  pre_non_swift: []
+  pre_opening_balance: []
+  pre_related_reference: []
+  pre_statement: [!!python/name:mt940.processors.date_fixup_pre_processor '']
+  pre_statement_number: []
+  pre_sum_credit_entries: []
+  pre_sum_debit_entries: []
+  pre_transaction_details: []
+  pre_transaction_reference_number: []
+transactions:
+- !!python/object:mt940.models.Transaction
+  data:
+    additional_purpose: '2017-01-01T13:12:11'
+    amount: !!python/object:mt940.models.Amount
+      amount: !!python/object/apply:decimal.Decimal ['-6.00']
+      currency: EUR
+    applicant_bin: '12345678'
+    applicant_iban: null
+    applicant_name: null
+    bank_reference: null
+    currency: EUR
+    customer_reference: NONREF
+    date: !!python/object/apply:mt940.models.Date
+    - !!binary |
+      B+ACHQ==
+    entry_date: !!python/object/apply:mt940.models.Date
+    - !!binary |
+      B+ADAQ==
+    extra_details: ''
+    funds_code: R
+    id: N024
+    posting_text: ENTGELTABSCHLUSS
+    prima_nota: '6666'
+    purpose: Pauschalen
+    recipient_name: null
+    return_debit_notes: null
+    status: D
+    transaction_code: '805'
+  transactions: *id001