In [22]:
# download presidio
# ! pip install presidio_analyzer presidio_anonymizer
# ! python -m spacy download en_core_web_lg
! python -m spacy download zh_core_web_lg

Collecting zh-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.8.0/zh_core_web_lg-3.8.0-py3-none-any.whl (603.0 MB)
     ---------------------------------------- 0.0/603.0 MB ? eta -:--:--
      ------------------------------------ 10.5/603.0 MB 217.5 MB/s eta 0:00:03
     - ------------------------------------ 21.0/603.0 MB 63.1 MB/s eta 0:00:10
     - ------------------------------------ 28.6/603.0 MB 46.4 MB/s eta 0:00:13
     - ------------------------------------ 31.5/603.0 MB 51.1 MB/s eta 0:00:12
     -- ----------------------------------- 41.9/603.0 MB 46.8 MB/s eta 0:00:12
     --- ---------------------------------- 52.4/603.0 MB 44.5 MB/s eta 0:00:13
     --- ---------------------------------- 52.7/603.0 MB 36.9 MB/s eta 0:00:15
     --- ---------------------------------- 62.9/603.0 MB 43.1 MB/s eta 0:00:13
     ---- --------------------------------- 73.4/603.0 MB 42.2 MB/s eta 0:00:13
     ----- -------------------

In [23]:
import spacy

# Sanity check: loading succeeds only if the Chinese model downloaded above
# is importable from this kernel. The pipeline object is displayed, not kept.
spacy.load("zh_core_web_lg")

<spacy.lang.zh.Chinese at 0x20203e3db20>

In [24]:
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import json
from pprint import pprint

In [39]:
# Sample input shared by the analyzer and anonymizer cells below. It contains
# pronouns, a title, a surname and a phone number.
text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"
# Alternative sample: a Chinese sentence with a name and the same phone number.
# NOTE(review): the cells below call analyze(..., language='en'); presumably a
# Chinese-configured engine is needed before switching to this line - confirm.
# text_to_anonymize = "他的名字是张三，电话号码是 212-555-5555"

In [40]:
# Create an analyzer with the library defaults and restrict this first pass
# to phone numbers and person names in the English sample text.
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON"],
    language="en",
)

# Each result reports entity type, character span (start/end) and score.
print(analyzer_results)

[type: PERSON, start: 16, end: 21, score: 0.85, type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]


In [5]:
# Two custom deny-list recognizers: one tags honorific titles as TITLE, the
# other tags a small set of pronouns as PRONOUN. Each one is registered on the
# existing analyzer as soon as it is built.
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Miss"],
)
analyzer.registry.add_recognizer(titles_recognizer)

pronoun_recognizer = PatternRecognizer(
    supported_entity="PRONOUN",
    deny_list=["he", "He", "his", "His", "she", "She", "hers", "Hers"],
)
analyzer.registry.add_recognizer(pronoun_recognizer)

# Re-run the analysis, this time asking only for the two custom entity types.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["TITLE", "PRONOUN"],
    language="en",
)
print(analyzer_results)


[type: PRONOUN, start: 0, end: 3, score: 1.0, type: TITLE, start: 12, end: 15, score: 1.0, type: PRONOUN, start: 26, end: 29, score: 1.0]


In [6]:
# No `entities` argument this time, so the results are not restricted to a
# chosen subset of entity types. These results feed the anonymizer cell below.
analyzer_results = analyzer.analyze(
    language="en",
    text=text_to_anonymize,
)

# Bare last expression: let the notebook display the result list.
analyzer_results

[type: PRONOUN, start: 0, end: 3, score: 1.0,
 type: TITLE, start: 12, end: 15, score: 1.0,
 type: PRONOUN, start: 26, end: 29, score: 1.0,
 type: PERSON, start: 16, end: 21, score: 0.85,
 type: PHONE_NUMBER, start: 46, end: 58, score: 0.75]

In [13]:
# Per-entity anonymization rules:
#   DEFAULT      -> "replace" operator with an empty new_value
#   PHONE_NUMBER -> "mask" the first 7 characters with "*" (from_end is False)
#   TITLE        -> "redact" (remove the matched text)
operator_configs = {
    "DEFAULT": OperatorConfig("replace", {"new_value": ""}),
    "PHONE_NUMBER": OperatorConfig(
        "mask",
        {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 7,
            "from_end": False,
        },
    ),
    "TITLE": OperatorConfig("redact", {}),
}

anonymizer = AnonymizerEngine()
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operator_configs,
)

# Show the anonymized text first, then the per-entity detail of what changed.
print(f"text: {anonymized_results.text}")
print("detailed response:")

pprint(json.loads(anonymized_results.to_json()))

text: <PRONOUN> name is  <PERSON> and <PRONOUN> phone number is *******-5555
detailed response:
{'items': [{'end': 70,
            'entity_type': 'PHONE_NUMBER',
            'operator': 'mask',
            'start': 58,
            'text': '*******-5555'},
           {'end': 41,
            'entity_type': 'PRONOUN',
            'operator': 'replace',
            'start': 32,
            'text': '<PRONOUN>'},
           {'end': 27,
            'entity_type': 'PERSON',
            'operator': 'replace',
            'start': 19,
            'text': '<PERSON>'},
           {'end': 18,
            'entity_type': 'TITLE',
            'operator': 'redact',
            'start': 18,
            'text': ''},
           {'end': 9,
            'entity_type': 'PRONOUN',
            'operator': 'replace',
            'start': 0,
            'text': '<PRONOUN>'}],
 'text': '<PRONOUN> name is  <PERSON> and <PRONOUN> phone number is '
         '*******-5555'}
