Skip to content

Commit

Permalink
Merge pull request #89 from vmenger/improve-phone-number-detection
Browse files Browse the repository at this point in the history
Improve phone number detection
  • Loading branch information
vmenger committed Aug 1, 2023
2 parents a63eec4 + 9b58bb7 commit 9b7c969
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- by default, deduce now recognizes all 7+ digit numbers as identifiers
- improved regular expressions for e-mail address and url matching
- separate tags for emails and urls
- logic for detecting phone numbers (improvements for hyphens, whitespace, false positive identifiers)
- improved regular expression for age matching

### Removed
Expand Down
66 changes: 27 additions & 39 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -159,22 +159,6 @@
]
}
},
"institution": {
"annotator_type": "multi_token",
"group": "institutions",
"args": {
"lookup_values": "institutions",
"tag": "instelling"
}
},
"altrecht": {
"annotator_type": "regexp",
"group": "institutions",
"args": {
"regexp_pattern": "[aA][lL][tT][rR][eE][cC][hH][tT]((\\s[A-Z][\\w]*)*)",
"tag": "instelling"
}
},
"residence": {
"annotator_type": "multi_token",
"group": "locations",
Expand Down Expand Up @@ -209,28 +193,20 @@
"tag": "locatie"
}
},
"phone_1": {
"annotator_type": "regexp",
"group": "phone_numbers",
"args": {
"regexp_pattern": "(((0)[1-9]{2}[0-9][-]?[1-9][0-9]{5})|((\\+31|0|0031)[1-9][0-9][-]?[1-9][0-9]{6}))",
"tag": "telefoonnummer"
}
},
"phone_2": {
"annotator_type": "regexp",
"group": "phone_numbers",
"institution": {
"annotator_type": "multi_token",
"group": "institutions",
"args": {
"regexp_pattern": "(((\\+31|0|0031)6)[-]?[1-9][0-9]{7})",
"tag": "telefoonnummer"
"lookup_values": "institutions",
"tag": "instelling"
}
},
"phone_3": {
"altrecht": {
"annotator_type": "regexp",
"group": "phones",
"group": "institutions",
"args": {
"regexp_pattern": "((\\(\\d{3}\\)|\\d{3})\\s?\\d{3}\\s?\\d{2}\\s?\\d{2})",
"tag": "telefoonnummer"
"regexp_pattern": "[aA][lL][tT][rR][eE][cC][hH][tT]((\\s[A-Z][\\w]*)*)",
"tag": "instelling"
}
},
"date_1": {
Expand All @@ -251,6 +227,15 @@
"capturing_group": 1
}
},
"age": {
"annotator_type": "regexp",
"group": "ages",
"args": {
"regexp_pattern": "(?i)(?<!(policontrole) )(?<!(gedurende) )(?<!(controle|onder de) )(?<!(sinds|om de|in de) )(?<!(over|elke) )(?<!(nog) )(?<!(na|op|al) )(?<!(<) )(?<!\\d-)(?<!\\d\\d-)(?<!\\d)(?<!\\d[,.])((((\\d-|\\d\\d-))?(\\d[,\\.]\\d|\\d{1,3}))([ -](jarige|jarig|jaar)))(?!\\w)(?! (geleden|na|aanwezig|getrouwd|gestopt|gerookt|gebruikt|gestaakt))",
"tag": "leeftijd",
"capturing_group": 10
}
},
"identifier": {
"annotator_type": "regexp",
"group": "identifiers",
Expand All @@ -259,13 +244,16 @@
"tag": "id"
}
},
"age": {
"annotator_type": "regexp",
"group": "ages",
"phone": {
"annotator_type": "custom",
"group": "phone_numbers",
"args": {
"regexp_pattern": "(?i)(?<!(policontrole) )(?<!(gedurende) )(?<!(controle|onder de) )(?<!(sinds|om de|in de) )(?<!(over|elke) )(?<!(nog) )(?<!(na|op|al) )(?<!(<) )(?<!\\d-)(?<!\\d\\d-)(?<!\\d)(?<!\\d[,.])((((\\d-|\\d\\d-))?(\\d[,\\.]\\d|\\d{1,3}))([ -](jarige|jarig|jaar)))(?!\\w)(?! (geleden|na|aanwezig|getrouwd|gestopt|gerookt|gebruikt|gestaakt))",
"tag": "leeftijd",
"capturing_group": 10
"module": "deduce.process.annotator",
"class": "PhoneNumberAnnotator",
"phone_regexp": "(^|(?<!\\d))(\\(?(0031|\\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\\d{2})\\)?) ?-? ?((\\d{2,4}[ -]?)+\\d{2,4})",
"min_digits": 9,
"max_digits": 11,
"tag": "telefoonnummer"
}
},
"email": {
Expand Down
49 changes: 49 additions & 0 deletions deduce/process/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,52 @@ def annotate(self, doc: Document) -> list[Annotation]:
annotations.append(Annotation(text=text, start_char=start, end_char=end, tag=self.tag))

return annotations


class PhoneNumberAnnotator(dd.process.Annotator):
    """
    Annotates phone numbers.

    Candidates are located with a regular expression and then filtered: a candidate is dropped when it
    contains more than one hyphen, or when its number of digits falls outside the configured bounds.

    The pattern must define the capturing groups that ``annotate`` reads by index:

    - group 2: the prefix as written in the text, including optional parentheses
    - group 4: the prefix without the leading ``0`` / ``+31`` / ``0031`` part
    - group 5: the remainder of the number

    Args:
        phone_regexp: The regular expression (as a string) used to find candidates.
        min_digits: Minimum number of digits for a candidate to be annotated. Digits are counted
            on the normalised form, i.e. ``"0"`` + digits of group 4 + digits of group 5.
        max_digits: Maximum number of digits, counted the same way.
        *args, **kwargs: Forwarded unchanged to ``dd.process.Annotator``. ``annotate`` reads
            ``self.tag``, which is presumably set there from a ``tag`` argument.
    """

    # Normalised prefixes whose numbers are shorter than regular ones; for these both digit
    # bounds are shifted by ``_SHORT_NUMBER_SHIFT``.
    _SHORT_NUMBER_PREFIXES = frozenset({"0800", "0900", "0906", "0909"})
    _SHORT_NUMBER_SHIFT = -2

    def __init__(self, phone_regexp: str, *args, min_digits: int = 9, max_digits: int = 11, **kwargs) -> None:
        # Compile once; the pattern is applied to every document.
        self.phone_regexp = re.compile(phone_regexp)
        self.min_digits = min_digits
        self.max_digits = max_digits

        super().__init__(*args, **kwargs)

    def annotate(self, doc: Document) -> list[Annotation]:
        """
        Find phone numbers in ``doc.text``.

        Args:
            doc: The document to annotate; only its ``text`` attribute is read.

        Returns:
            One annotation, tagged with ``self.tag``, per accepted match, in order of occurrence.
        """

        annotations = []

        for match in self.phone_regexp.finditer(doc.text):
            matched_text = match.group(0)

            # Check max 1 hyphen (cheapest rejection, so do it first)
            if matched_text.count("-") > 1:
                continue

            prefix_with_parens = match.group(2)

            # Normalise the prefix to national notation so that 0031/+31/0 all count the same.
            prefix_digits = "0" + re.sub(r"\D", "", match.group(4))
            number_digits = re.sub(r"\D", "", match.group(5))

            # Shift num digits for shorter numbers
            digit_len_shift = self._SHORT_NUMBER_SHIFT if prefix_digits in self._SHORT_NUMBER_PREFIXES else 0
            num_digits = len(prefix_digits) + len(number_digits)

            if not (self.min_digits + digit_len_shift <= num_digits <= self.max_digits + digit_len_shift):
                continue

            # Trim parenthesis: an opening "(" that is not closed within the prefix is not part of
            # the number (e.g. a number written as a whole between parentheses), so skip it.
            unclosed_paren = prefix_with_parens.startswith("(") and not prefix_with_parens.endswith(")")
            left_index_shift = 1 if unclosed_paren else 0

            start_char, end_char = match.span(0)

            annotations.append(
                Annotation(
                    text=matched_text[left_index_shift:],
                    start_char=start_char + left_index_shift,
                    end_char=end_char,
                    tag=self.tag,
                )
            )

        return annotations
9 changes: 3 additions & 6 deletions docs/source/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,14 @@ The `Annotator` is responsible for tagging pieces of information in the text as
| | street_with_number | regexp | Street names, with optionally a house number |
| | postal_code | regexp | Postal codes |
| | postbus | regexp | Postbussen |
| phone_numbers | phone_1 | regexp | Phone numbers (pattern 1) |
| | phone_2 | regexp | Phone numbers (pattern 2) |
| | phone_3 | regexp | Phone numbers (pattern 3) |
| identifiers | identifier | regexp | Identifiers (7+ digit numbers) |
| dates | date_1 | regexp | Dates (pattern 1) |
| | date_2 | regexp | Dates (pattern 2) |
| ages | age | regexp | Ages |
| identifiers | identifier | regexp | Identifiers (7+ digit numbers) |
| bsn | bsn | custom | BSN-numbers (9 digits + specific 'elfproef') |
| phone_numbers | phone | custom | Phone numbers |
| email_addresses | email | regexp | E-mail addresses |
| urls | url | regexp | URLs |
| bsn | bsn | custom | BSN-numbers (9 digits + specific 'elfproef') |


It's possible to add, remove, apply subsets or implement custom annotators, those options are described further down under [customizing deduce](#customizing-deduce).

Expand Down
73 changes: 72 additions & 1 deletion tests/unit/process/test_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
import pytest

from deduce.pattern.name_context import AnnotationContextPattern
from deduce.process.annotator import AnnotationContextPatternAnnotator, BsnAnnotator
from deduce.process.annotator import (
AnnotationContextPatternAnnotator,
BsnAnnotator,
PhoneNumberAnnotator,
)
from tests.helpers import link_tokens


Expand All @@ -18,6 +22,17 @@ def bsn_doc():
)


@pytest.fixture
def phone_number_doc():
    """
    Return the result of de-identifying a Dutch sample sentence with a ``DocDeid`` built without arguments.

    The sentence contains phone numbers in several notations (hyphenated, spaced and wrapped in
    parentheses, parenthesised prefix), one number that is too short, one that is too long, and
    a 0800 number.
    """

    sample_text = (
        "Telefoonnummers zijn 0314-555555, (088 755 55 55) of (06)55555555, maar 065555 is "
        "te kort en 065555555555 is te lang. Verwijsnummer is 0800-9003."
    )

    return dd.DocDeid().deidentify(text=sample_text)


class ExtendCapitalContextPattern(AnnotationContextPattern):
def annotation_precondition(self, annotation: dd.Annotation) -> bool:
return annotation.end_token.next() is not None
Expand Down Expand Up @@ -290,3 +305,59 @@ def test_annotate(self, bsn_doc):
]

assert annotations == expected_annotations


class TestPhoneNumberAnnotator:
    """Tests for ``PhoneNumberAnnotator``, all run against the ``phone_number_doc`` fixture."""

    # Every test uses the same pattern; only the digit bounds vary between tests.
    PHONE_REGEXP = (
        r"(^|(?<!\d))"
        r"(\(?(0031|\+31|0)(1[035]|2[0347]|3[03568]|4[03456]|5[0358]|6|7|88|800|91|90[069]|[1-5]\d{2})\)?)"
        r" ?-? ?"
        r"((\d{2,4}[ -]?)+\d{2,4})"
    )

    def _annotate(self, doc, **digit_bounds):
        """Build an annotator with tag ``"_"`` and the given digit bounds, and run it on ``doc``."""

        annotator = PhoneNumberAnnotator(phone_regexp=self.PHONE_REGEXP, tag="_", **digit_bounds)

        return annotator.annotate(doc)

    def test_annotate_defaults(self, phone_number_doc):
        """With default bounds, the regular numbers and the 0800 number are annotated."""

        expected = [
            dd.Annotation(text="0314-555555", start_char=21, end_char=32, tag="_"),
            dd.Annotation(text="088 755 55 55", start_char=35, end_char=48, tag="_"),
            dd.Annotation(text="(06)55555555", start_char=53, end_char=65, tag="_"),
            dd.Annotation(text="0800-9003", start_char=135, end_char=144, tag="_"),
        ]

        assert self._annotate(phone_number_doc) == expected

    def test_annotate_short(self, phone_number_doc):
        """With bounds 4-8, only the short number is annotated."""

        expected = [dd.Annotation(text="065555", start_char=72, end_char=78, tag="_")]

        assert self._annotate(phone_number_doc, min_digits=4, max_digits=8) == expected

    def test_annotate_long(self, phone_number_doc):
        """With bounds 11-12, only the long number is annotated."""

        expected = [dd.Annotation(text="065555555555", start_char=93, end_char=105, tag="_")]

        assert self._annotate(phone_number_doc, min_digits=11, max_digits=12) == expected
4 changes: 1 addition & 3 deletions tests/unit/test_deduce_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,7 @@ def test_annotate_phone_number(self):
text = "088-7555555, 088-1309670"

annotator = [
get_annotator("phone_1", group="phone_numbers"),
get_annotator("phone_2", group="phone_numbers"),
get_annotator("phone_2", group="phone_numbers"),
get_annotator("phone", group="phone_numbers"),
]
expected_annotations = {
dd.Annotation(text="088-7555555", start_char=0, end_char=11, tag=annotator[0].tag),
Expand Down

0 comments on commit 9b7c969

Please sign in to comment.