Merge pull request #89 from vmenger/improve-phone-number-detection

Improve phone number detection
vmenger · Aug 1, 2023 · 9b7c969 · 9b7c969
2 parents a63eec4 + 9b58bb7
commit 9b7c969
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 49 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - by default, deduce now recognizes all 7+ digit numbers as identifiers
 - improved regular expressions for e-mail address and url matching
 - separate tags for emails and urls
+- improved logic for detecting phone numbers (handling of hyphens and whitespace, false positive identifiers)
 - improved regular expression for age matching
 
 ### Removed

diff --git a/config.json b/config.json
@@ -159,22 +159,6 @@
                 ]
             }
         },
-        "institution": {
-            "annotator_type": "multi_token",
-            "group": "institutions",
-            "args": {
-                "lookup_values": "institutions",
-                "tag": "instelling"
-            }
-        },
-        "altrecht": {
-            "annotator_type": "regexp",
-            "group": "institutions",
-            "args": {
-                "regexp_pattern": "[aA][lL][tT][rR][eE][cC][hH][tT]((\\s[A-Z][\\w]*)*)",
-                "tag": "instelling"
-            }
-        },
         "residence": {
             "annotator_type": "multi_token",
             "group": "locations",
@@ -209,28 +193,20 @@
                 "tag": "locatie"
             }
         },
-        "phone_1": {
-            "annotator_type": "regexp",
-            "group": "phone_numbers",
-            "args": {
-                "regexp_pattern": "(((0)[1-9]{2}[0-9][-]?[1-9][0-9]{5})|((\\+31|0|0031)[1-9][0-9][-]?[1-9][0-9]{6}))",
-                "tag": "telefoonnummer"
-            }
-        },
-        "phone_2": {
-            "annotator_type": "regexp",
-            "group": "phone_numbers",
+        "institution": {
+            "annotator_type": "multi_token",
+            "group": "institutions",
             "args": {
-                "regexp_pattern": "(((\\+31|0|0031)6)[-]?[1-9][0-9]{7})",
-                "tag": "telefoonnummer"
+                "lookup_values": "institutions",
+                "tag": "instelling"
             }
         },
-        "phone_3": {
+        "altrecht": {
             "annotator_type": "regexp",
-            "group": "phones",
+            "group": "institutions",
             "args": {
-                "regexp_pattern": "((\\(\\d{3}\\)|\\d{3})\\s?\\d{3}\\s?\\d{2}\\s?\\d{2})",
-                "tag": "telefoonnummer"
+                "regexp_pattern": "[aA][lL][tT][rR][eE][cC][hH][tT]((\\s[A-Z][\\w]*)*)",
+                "tag": "instelling"
             }
         },
         "date_1": {
@@ -251,6 +227,15 @@
                 "capturing_group": 1
             }
         },
+        "age": {
+            "annotator_type": "regexp",
+            "group": "ages",
+            "args": {
+                "regexp_pattern": "(?i)(?<!(policontrole) )(?<!(gedurende) )(?<!(controle|onder de) )(?<!(sinds|om de|in de) )(?<!(over|elke) )(?<!(nog) )(?<!(na|op|al) )(?<!(<) )(?<!\\d-)(?<!\\d\\d-)(?<!\\d)(?<!\\d[,.])((((\\d-|\\d\\d-))?(\\d[,\\.]\\d|\\d{1,3}))([ -](jarige|jarig|jaar)))(?!\\w)(?! (geleden|na|aanwezig|getrouwd|gestopt|gerookt|gebruikt|gestaakt))",
+                "tag": "leeftijd",
+                "capturing_group": 10
+            }
+        },
         "identifier": {
             "annotator_type": "regexp",
             "group": "identifiers",
@@ -259,13 +244,16 @@
                 "tag": "id"
             }
         },
-        "age": {
-            "annotator_type": "regexp",
-            "group": "ages",
+        "phone": {
+            "annotator_type": "custom",
+            "group": "phone_numbers",
             "args": {
-                "regexp_pattern": "(?i)(?<!(policontrole) )(?<!(gedurende) )(?<!(controle|onder de) )(?<!(sinds|om de|in de) )(?<!(over|elke) )(?<!(nog) )(?<!(na|op|al) )(?<!(<) )(?<!\\d-)(?<!\\d\\d-)(?<!\\d)(?<!\\d[,.])((((\\d-|\\d\\d-))?(\\d[,\\.]\\d|\\d{1,3}))([ -](jarige|jarig|jaar)))(?!\\w)(?! (geleden|na|aanwezig|getrouwd|gestopt|gerookt|gebruikt|gestaakt))",
-                "tag": "leeftijd",
-                "capturing_group": 10
+                "module": "deduce.process.annotator",
+                "class": "PhoneNumberAnnotator",
+                "phone_regexp": "(^|(?<!\\d))(\\(?(0031|\\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\\d{2})\\)?) ?-? ?((\\d{2,4}[ -]?)+\\d{2,4})",
+                "min_digits": 9,
+                "max_digits": 11,
+                "tag": "telefoonnummer"
             }
         },
         "email": {

diff --git a/deduce/process/annotator.py b/deduce/process/annotator.py
@@ -147,3 +147,52 @@ def annotate(self, doc: Document) -> list[Annotation]:
                 annotations.append(Annotation(text=text, start_char=start, end_char=end, tag=self.tag))
 
         return annotations
+
+
+class PhoneNumberAnnotator(dd.process.Annotator):
+    """Annotates regexp matches whose digit count lies between min_digits and max_digits as phone numbers."""
+
+    def __init__(self, phone_regexp: str, *args, min_digits: int = 9, max_digits: int = 11, **kwargs) -> None:
+        # phone_regexp must define groups 2 (prefix incl. parentheses), 4 (prefix digits) and 5 (rest of number).
+        self.phone_regexp = re.compile(phone_regexp)
+        self.min_digits = min_digits
+        self.max_digits = max_digits
+        # Remaining arguments go to the base Annotator; self.tag used in annotate() is not set in this class.
+        super().__init__(*args, **kwargs)
+
+    def annotate(self, doc: Document) -> list[Annotation]:
+        # Returns one Annotation per accepted match in doc.text; rejected matches are skipped.
+        annotations = []
+
+        for match in self.phone_regexp.finditer(doc.text):
+            # The "0" prepended to group 4 replaces the 0031 / +31 / 0 alternative of the pattern in config.json.
+            digit_len_shift = 0
+            left_index_shift = 0
+            prefix_with_parens = match.group(2)
+            prefix_digits = "0" + re.sub(r"\D", "", match.group(4))
+            number_digits = re.sub(r"\D", "", match.group(5))
+
+            # Drop an opening parenthesis that is not closed within the prefix: "(088 755 55 55" -> "088 755 55 55"
+            if prefix_with_parens.startswith("(") and not prefix_with_parens.endswith(")"):
+                left_index_shift = 1
+
+            # Skip matches that contain more than one hyphen
+            if len(re.findall("-", match.group(0))) > 1:
+                continue
+
+            # These prefixes take shorter numbers, so both digit bounds drop by 2 (e.g. "0800-9003" has 8 digits)
+            if prefix_digits in ["0800", "0900", "0906", "0909"]:
+                digit_len_shift = -2
+            # Accept only if the total digit count (prefix + number) lies within the shifted bounds, inclusive.
+            if (
+                (self.min_digits + digit_len_shift)
+                <= (len(prefix_digits) + len(number_digits))
+                <= (self.max_digits + digit_len_shift)
+            ):
+                text = match.group(0)[left_index_shift:]
+                start_char, end_char = match.span(0)
+                start_char += left_index_shift
+                # end_char stays as matched: only a leading parenthesis is ever trimmed from the span.
+                annotations.append(Annotation(text=text, start_char=start_char, end_char=end_char, tag=self.tag))
+
+        return annotations
diff --git a/docs/source/tutorial.md b/docs/source/tutorial.md
@@ -37,17 +37,14 @@ The `Annotator` is responsible for tagging pieces of information in the text as
 |                 | street_with_number       | regexp             | Street names, with optionally a house number                                               |
 |                 | postal_code              | regexp             | Postal codes                                                                               |
 |                 | postbus                  | regexp             | Postbussen                                                                                 |
-| phone_numbers   | phone_1                  | regexp             | Phone numbers (pattern 1)                                                                  |
-|                 | phone_2                  | regexp             | Phone numbers (pattern 2)                                                                  |
-|                 | phone_3                  | regexp             | Phone numbers (pattern 3)                                                                  |
-| identifiers     | identifier               | regexp             | Identifiers (7+ digit numbers)                                                             |
 | dates           | date_1                   | regexp             | Dates (pattern 1)                                                                          |
 |                 | date_2                   | regexp             | Dates (pattern 2)                                                                          |
 | ages            | age                      | regexp             | Ages                                                                                       |
+| identifiers     | identifier               | regexp             | Identifiers (7+ digit numbers)                                                             |
+| bsn             | bsn                      | custom             | BSN-numbers (9 digits + specific 'elfproef')                                               |
+| phone_numbers   | phone                    | custom             | Phone numbers                                                                              |
 | email_addresses | email                    | regexp             | E-mail addresses                                                                           |
 | urls            | url                      | regexp             | URLs                                                                                       |
-| bsn             | bsn                      | custom             | BSN-numbers (9 digits + specific 'elfproef')                                               |
-
 
 It's possible to add, remove, apply subsets or implement custom annotators, those options are described further down under [customizing deduce](#customizing-deduce). 
 

diff --git a/tests/unit/process/test_annotator.py b/tests/unit/process/test_annotator.py
@@ -4,7 +4,11 @@
 import pytest
 
 from deduce.pattern.name_context import AnnotationContextPattern
-from deduce.process.annotator import AnnotationContextPatternAnnotator, BsnAnnotator
+from deduce.process.annotator import (
+    AnnotationContextPatternAnnotator,
+    BsnAnnotator,
+    PhoneNumberAnnotator,
+)
 from tests.helpers import link_tokens
 
 
@@ -18,6 +22,17 @@ def bsn_doc():
     )
 
 
+@pytest.fixture
+def phone_number_doc():
+    # Dutch sample text: three regular numbers, one marked too short, one too long, and a short 0800 number.
+    d = dd.DocDeid()
+    # NOTE(review): presumably a DocDeid without processors just wraps the text in a document - confirm.
+    return d.deidentify(
+        text="Telefoonnummers zijn 0314-555555, (088 755 55 55) of (06)55555555, maar 065555 is "
+        "te kort en 065555555555 is te lang. Verwijsnummer is 0800-9003."
+    )
+
+
 class ExtendCapitalContextPattern(AnnotationContextPattern):
     def annotation_precondition(self, annotation: dd.Annotation) -> bool:
         return annotation.end_token.next() is not None
@@ -290,3 +305,59 @@ def test_annotate(self, bsn_doc):
         ]
 
         assert annotations == expected_annotations
+
+
+class TestPhoneNumberAnnotator:
+    def test_annotate_defaults(self, phone_number_doc):
+        # Default bounds (9-11 digits): expects the three regular numbers plus the shorter "0800-9003".
+        an = PhoneNumberAnnotator(
+            phone_regexp=r"(^|(?<!\d))"
+            r"(\(?(0031|\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\d{2})\)?)"
+            r" ?-? ?"
+            r"((\d{2,4}[ -]?)+\d{2,4})",
+            tag="_",
+        )
+        annotations = an.annotate(phone_number_doc)
+
+        expected_annotations = [
+            dd.Annotation(text="0314-555555", start_char=21, end_char=32, tag="_"),
+            dd.Annotation(text="088 755 55 55", start_char=35, end_char=48, tag="_"),
+            dd.Annotation(text="(06)55555555", start_char=53, end_char=65, tag="_"),
+            dd.Annotation(text="0800-9003", start_char=135, end_char=144, tag="_"),
+        ]
+
+        assert annotations == expected_annotations
+
+    def test_annotate_short(self, phone_number_doc):
+        # With bounds 4-8 the test expects only the 6-digit "065555" to be annotated.
+        an = PhoneNumberAnnotator(
+            phone_regexp=r"(^|(?<!\d))"
+            r"(\(?(0031|\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\d{2})\)?)"
+            r" ?-? ?"
+            r"((\d{2,4}[ -]?)+\d{2,4})",
+            min_digits=4,
+            max_digits=8,
+            tag="_",
+        )
+        annotations = an.annotate(phone_number_doc)
+
+        expected_annotations = [dd.Annotation(text="065555", start_char=72, end_char=78, tag="_")]
+
+        assert annotations == expected_annotations
+
+    def test_annotate_long(self, phone_number_doc):
+        # With bounds 11-12 the test expects only the 12-digit "065555555555" to be annotated.
+        an = PhoneNumberAnnotator(
+            phone_regexp=r"(^|(?<!\d))"
+            r"(\(?(0031|\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\d{2})\)?)"
+            r" ?-? ?"
+            r"((\d{2,4}[ -]?)+\d{2,4})",
+            min_digits=11,
+            max_digits=12,
+            tag="_",
+        )
+        annotations = an.annotate(phone_number_doc)
+
+        expected_annotations = [dd.Annotation(text="065555555555", start_char=93, end_char=105, tag="_")]
+
+        assert annotations == expected_annotations
diff --git a/tests/unit/test_deduce_processors.py b/tests/unit/test_deduce_processors.py
@@ -141,9 +141,7 @@ def test_annotate_phone_number(self):
         text = "088-7555555, 088-1309670"
 
         annotator = [
-            get_annotator("phone_1", group="phone_numbers"),
-            get_annotator("phone_2", group="phone_numbers"),
-            get_annotator("phone_2", group="phone_numbers"),
+            get_annotator("phone", group="phone_numbers"),
         ]
         expected_annotations = {
             dd.Annotation(text="088-7555555", start_char=0, end_char=11, tag=annotator[0].tag),