## 隐私数据匿名化

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
import spacy
from spacy.language import Language

nlp = spacy.load('en_core_web_sm')

@Language.component("anonymize_entities")
def anonymize_entities(doc):
    anonymized_tokens = []
    for token in doc:
        if token.ent_type_:
            anonymized_tokens.append('#' + token.ent_type_ + '#' * (len(token.text) - len(token.ent_type_) - 1))
        else:
            anonymized_tokens.append(token.text)
    return nlp.make_doc(' '.join(anonymized_tokens))  # create new Doc object

nlp.add_pipe("anonymize_entities", last=True)

texts = ['John Doe lives in New York and works at Google.', 
         'Alice is studying at the University of Oxford.']

for text in texts:
    doc = nlp(text)
    print('Anonymized Text:', doc.text)

Anonymized Text: #PERSON #PERSON lives in #GPE #GPE and works at #ORG## .
Anonymized Text: #PERSON is studying at #ORG #ORG###### #ORG #ORG## .


In [19]:
import spacy
import random
from spacy.language import Language

nlp = spacy.load('en_core_web_sm')

# List of replacement names
replacement_names = ['Michael Jackson', 'Madonna', 'Beyonce', 'Prince', 'Elvis Presley']

@Language.component("substitute_entities")
def substitute_entities(doc):
    substituted_tokens = []
    for token in doc:
        if token.ent_type_ == 'PERSON':
            # Substitute the entity with a random name from the list
            substituted_name = random.choice(replacement_names)
            # Split the name into tokens and use the appropriate token
            substituted_tokens.extend(substituted_name.split())
            replacement_names.remove(substituted_name)  # To avoid repeated names in the same doc
        else:
            substituted_tokens.append(token.text)
    return nlp.make_doc(' '.join(substituted_tokens))  # create new Doc object

nlp.add_pipe("substitute_entities", last=True)

texts = ['John Doe lives in New York and works at Google.', 
         'Alice is studying at the University of Oxford.']

for text in texts:
    doc = nlp(text)
    print('Substituted Text:', doc.text)

Substituted Text: Prince Elvis Presley lives in New York and works at Google .
Substituted Text: Beyonce is studying at the University of Oxford .
