# spaCy

https://spacy.io/usage/spacy-101

In [None]:
!pip install spacy

In [None]:
!python -m spacy download en

In [None]:
import spacy

# Load the "en_core_web_sm" pipeline into the global `nlp`, which
# mask_entities() below calls to find entities.
nlp = spacy.load("en_core_web_sm")

def mask_entities(text, nlp_pipeline=None):
    """Replace PERSON and NORP entities in ``text`` with redaction placeholders.

    Entities are replaced by their character offsets, so only the detected
    spans change. A plain ``str.replace`` on the entity text would also
    rewrite the same characters inside unrelated words (for example "Al"
    inside "Alice") and could corrupt placeholders inserted earlier.

    Args:
        text (str): Text to scan.
        nlp_pipeline (callable, optional): Callable that turns ``text`` into a
            document exposing ``ents``; each entity must provide ``label_``,
            ``start_char`` and ``end_char``. Defaults to the module-level
            ``nlp``, looked up at call time.

    Returns:
        str: ``text`` with PERSON spans replaced by "[PERSON-REDACTED]" and
        NORP spans replaced by "[NORP-REDACTED]"; everything else unchanged.
    """
    placeholders = {
        # PERSON: People, including fictional characters
        "PERSON": "[PERSON-REDACTED]",
        # NORP: Nationalities or religious or political groups
        "NORP": "[NORP-REDACTED]",
    }

    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    doc = pipeline(text)

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        placeholder = placeholders.get(ent.label_)
        # Skip labels we do not mask and spans overlapping one already masked.
        if placeholder is None or ent.start_char < cursor:
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append(placeholder)
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return "".join(pieces)

In [None]:
# Example: a sentence containing a person's name and a nationality.
print(mask_entities("My name is John Abraham and I am an Indian"))

In [None]:
# Example: a longer paragraph in which the same name ("Noah") appears more than once.
print(mask_entities("Noah's Ark is a large ship mentioned in the Book of Genesis in the Bible. According to the story, God told Noah to build the ark to save himself, his family, and a pair of every kind of animal from a great flood that would destroy the rest of the world. The ark was built to be 300 cubits long, 50 cubits wide, and 30 cubits high (approximately 134 meters long, 22 meters wide, and 13 meters high). It had three decks and was made of gopher wood."))

In [None]:
# Example: text mentioning a nationality ("Indian") alongside organisation names and dates.
print(mask_entities("Chandrayaan-3 is the third Indian lunar exploration mission under the Indian Space Research Organisation's Chandrayaan programme. It was launched on July 14, 2023, and successfully soft-landed on the moon on August 23, 2023. "))

# scispaCy

https://spacy.io/universe/project/scispacy

Pre-requisites

In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

Load & run

In [None]:
import spacy
import scispacy

# Load the "en_ner_bc5cdr_md" model. NOTE: this rebinds the global `nlp`
# that the spaCy section above also assigns, so any function that reads
# `nlp` at call time uses this model from here on.
nlp = spacy.load("en_ner_bc5cdr_md")

def redact_drug_names(text, nlp_pipeline=None, verbose=True):
    """Replace every CHEMICAL entity in ``text`` with "[REDACTED]".

    Entities are replaced by their character offsets, so only the detected
    spans change; a plain ``str.replace`` on the entity text would also
    rewrite the same characters inside longer, unrelated words.

    Args:
        text (str): Text to scan.
        nlp_pipeline (callable, optional): Callable that turns ``text`` into a
            document exposing ``ents``; each entity must provide ``label_``,
            ``start_char`` and ``end_char``. Defaults to the module-level
            ``nlp``, looked up at call time.
        verbose (bool): When True (the default), print every detected entity
            together with its label.

    Returns:
        str: ``text`` with each CHEMICAL span replaced by "[REDACTED]".
    """
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    doc = pipeline(text)

    pieces = []
    cursor = 0  # index in `text` up to which output has been emitted
    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        if verbose:
            print(ent, ent.label_)
        # Skip non-chemical entities and spans overlapping one already redacted.
        if ent.label_ != "CHEMICAL" or ent.start_char < cursor:
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append("[REDACTED]")
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return "".join(pieces)

In [None]:
# Example: a sentence naming two drugs; print the text returned by redact_drug_names().
text = "The patient was prescribed aspirin and ibuprofen for their headache."
redacted_text = redact_drug_names(text)
print(redacted_text)

# Stanza

https://stanfordnlp.github.io/stanza/

Pre-requisites

In [None]:
pip install stanza

Downloading the model

In [None]:
import stanza
# Fetch the English resources the pipeline below needs.
stanza.download('en') # download English model

Initializing the neural pipeline

In [None]:
# Build the English neural pipeline. This rebinds the global `nlp`.
nlp = stanza.Pipeline('en')

# Annotate a single example sentence.
sample_sentence = "Narendra Damodardas Modi is an Indian politician."
doc = nlp(sample_sentence)

In [None]:
# Show the entities found in the annotated sentence.
print(doc.entities)

# Presidio

https://github.com/microsoft/presidio

## JSON Text Anonymization

Install

In [None]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

Setup Imports

In [None]:
from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint

import pandas as pd

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import EngineResult

In [None]:
# One analyzer engine is created and handed to the batch analyzer; the batch
# anonymizer is created with no arguments.
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
batch_anonymizer = BatchAnonymizerEngine()

In [None]:
# Sample input mixing a nested dict, a scalar and a list of strings.
# NOTE(review): the value of "key_b" is a set literal ({"..."}), not a dict
# or a plain string — confirm that is intended.
nested_dict = {
    "key_a": {"key_a1": "My phone number is 212-121-1424"},
    "key_b": {"www.abc.com"},
    "key_c": 3,
    "names": ["James Bond", "Clark Kent", "Hakeem Olajuwon", "No name here!"]
}


In [None]:
# Analyze dict: run the batch analyzer over nested_dict with language "en".
analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language="en")

# Anonymize dict: pass the analyzer results to the batch anonymizer and
# pretty-print what it returns.
# NOTE: `pprint` here is the module (`import pprint`). The PDF section later
# runs `from pprint import pprint`, which rebinds the name to the function;
# re-running this cell after that fails with AttributeError.
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)
pprint.pprint(anonymizer_results)

## DICOM Image Anonymization

Install

In [None]:
! apt install tesseract-ocr
! apt install libtesseract-dev

In [None]:
! pip install Pillow
! pip install pytesseract

In [None]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!pip install presidio_image_redactor

# Presidio image redactor uses the presidio-analyzer
# which requires a spaCy language model:
!python -m spacy download en_core_web_lg

In [None]:
import glob
from pathlib import Path
import matplotlib.pyplot as plt
import pydicom
from presidio_image_redactor import DicomImageRedactorEngine

In [None]:
def compare_dicom_images(
    instance_original: pydicom.dataset.FileDataset,
    instance_redacted: pydicom.dataset.FileDataset,
    figsize: tuple = (11, 11)
) -> None:
    """Plot the pixel data of an original and a redacted DICOM side by side.

    Args:
        instance_original (pydicom.dataset.FileDataset): DICOM instance before redaction.
        instance_redacted (pydicom.dataset.FileDataset): DICOM instance after redaction.
        figsize (tuple): Figure size in inches (width, height).
    """
    _, axes = plt.subplots(1, 2, figsize=figsize)

    # Left panel shows the original, right panel the redacted image.
    panels = (
        ("Original", instance_original),
        ("Redacted", instance_redacted),
    )
    for axis, (title, instance) in zip(axes, panels):
        axis.imshow(instance.pixel_array, cmap="gray")
        axis.set_title(title)

In [None]:
# Create the DICOM redactor engine with no arguments (library defaults).
engine = DicomImageRedactorEngine()

In [None]:
# Single DICOM (.dcm) file or directory containing DICOM files
input_path = 'sample_data/input'

# Directory where the output will be written
output_parent_dir = 'sample_data/output'

In [None]:
# Redact text PHI from DICOM images
engine.redact_from_directory(
    input_dicom_path = input_path,
    output_dir = output_parent_dir,
    fill="contrast"
)

In [None]:
# Original DICOM images
p = Path(input_path).glob("**/*.dcm")
original_files = [x for x in p if x.is_file()]

# Redacted DICOM images
p = Path(output_parent_dir).glob("**/*.dcm")
redacted_files = [x for x in p if x.is_file()]

In [None]:
# Show each original next to the redacted file at the same list position.
for position, original_path in enumerate(original_files):
    original_file = pydicom.dcmread(original_path)
    redacted_file = pydicom.dcmread(redacted_files[position])

    compare_dicom_images(original_file, redacted_file)

## PDF Anonymization

Prerequisites

In [None]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg
!pip install pdfminer.six
!pip install pikepdf

In [None]:
# For Presidio
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# For console output
from pprint import pprint

# For extracting text
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine

# For updating the PDF
from pikepdf import Pdf, AttachedFileSpec, Name, Dictionary, Array

Analyze the text in the PDF

In [None]:
analyzer = AnalyzerEngine()

# One entry per analyzer finding:
#   "characters": the LTChar glyphs covered by the finding,
#   "result":     the analyzer result itself,
#   "page_index": zero-based index of the page the text came from.
analyzed_character_sets = []

for page_index, page_layout in enumerate(extract_pages("./sample_data/sample.pdf")):
    for text_container in page_layout:
        if not isinstance(text_container, LTTextContainer):
            continue

        # The element is a LTTextContainer, containing a paragraph of text.
        text_to_anonymize = text_container.get_text()

        # Analyze the text using the analyzer engine
        analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')

        if not text_to_anonymize.isspace():
            print(text_to_anonymize)
            print(analyzer_results)

        # Map every character offset of the container's text to the glyph
        # that produced it. The text also contains characters inserted by
        # the layout analysis (spaces, line breaks) that have no glyph, and a
        # single glyph may contribute more than one character, so a plain
        # list of LTChar objects does not line up with the analyzer offsets.
        glyph_for_offset = []
        for text_line in text_container:
            if not isinstance(text_line, LTTextLine):
                continue
            for item in text_line:
                glyph = item if isinstance(item, LTChar) else None
                glyph_for_offset.extend([glyph] * len(item.get_text()))

        # Slice out the glyphs that match the analyzer results.
        for result in analyzer_results:
            characters = [
                glyph
                for glyph in glyph_for_offset[result.start:result.end]
                if glyph is not None
            ]
            if not characters:
                # Nothing drawable is covered by this finding.
                continue
            analyzed_character_sets.append(
                {"characters": characters, "result": result, "page_index": page_index}
            )

Create phrase bounding boxes

In [None]:
# Combine the bounding boxes into a single bounding box.
def combine_rect(rectA, rectB):
    """Return the smallest rectangle that contains both input rectangles.

    Each rectangle is indexed as (startX, startY, endX, endY); the result is
    a 4-tuple in the same order.
    """
    # Minimum of the two start corners, maximum of the two end corners.
    lower_corner = tuple(min(rectA[axis], rectB[axis]) for axis in (0, 1))
    upper_corner = tuple(max(rectA[axis], rectB[axis]) for axis in (2, 3))
    return lower_corner + upper_corner

analyzed_bounding_boxes = []

# For each character set, combine the bounding boxes into a single bounding box.
for analyzed_character_set in analyzed_character_sets:
    set_characters = analyzed_character_set["characters"]

    # A finding can end up with no characters; indexing [0] on it would
    # raise IndexError, and there is nothing to draw a box around anyway.
    if not set_characters:
        continue

    completeBoundingBox = set_characters[0].bbox

    for character in set_characters:
        completeBoundingBox = combine_rect(completeBoundingBox, character.bbox)

    bounding_box_entry = {"boundingBox": completeBoundingBox, "result": analyzed_character_set["result"]}

    # Keep the page index with the box when the character set carries one.
    if "page_index" in analyzed_character_set:
        bounding_box_entry["page_index"] = analyzed_character_set["page_index"]

    analyzed_bounding_boxes.append(bounding_box_entry)

Add highlight annotations

In [None]:
pdf = Pdf.open("./sample_data/sample.pdf")

# Flat list of every highlight created, in creation order.
annotations = []

# The same highlights grouped by the zero-based page they belong to.
annotations_by_page = {}

# Create a highlight annotation for each bounding box.
for analyzed_bounding_box in analyzed_bounding_boxes:

    boundingBox = analyzed_bounding_box["boundingBox"]

    # Create the annotation.
    # We could also create a redaction annotation if the ongoing workflows supports them.
    highlight = Dictionary(
        Type=Name.Annot,
        Subtype=Name.Highlight,
        # Corner order: top-left, top-right, bottom-left, bottom-right.
        QuadPoints=[boundingBox[0], boundingBox[3],
                    boundingBox[2], boundingBox[3],
                    boundingBox[0], boundingBox[1],
                    boundingBox[2], boundingBox[1]],
        Rect=[boundingBox[0], boundingBox[1], boundingBox[2], boundingBox[3]],
        C=[1, 0, 0],
        CA=0.5,
        T=analyzed_bounding_box["result"].entity_type,
    )

    annotations.append(highlight)

    # A box may carry an optional "page_index"; without one it is placed on
    # the first page.
    page_index = analyzed_bounding_box.get("page_index", 0)
    annotations_by_page.setdefault(page_index, []).append(highlight)

# Add the annotations to the page each one belongs to. Pages without
# findings are left untouched.
for page_index, page_annotations in annotations_by_page.items():
    pdf.pages[page_index].Annots = pdf.make_indirect(page_annotations)

# And save.
pdf.save("./sample_data/sample_annotated.pdf")