<a href="https://colab.research.google.com/github/tesims/automated-video-editor/blob/main/googleai_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install every third-party package imported by the cells below:
#   - llama-index-postprocessor-presidio: PresidioPIINodePostprocessor
#   - llama-index-llms-huggingface: `llama_index.llms.huggingface` import
#   - faker: used by the fake-name anonymizer cell (otherwise that cell
#     fails with "ModuleNotFoundError: No module named 'faker'")
!pip install llama-index llama-index-postprocessor-presidio llama-index-llms-huggingface faker
# spaCy model used by Presidio's default NLP engine configuration.
!python -m spacy download en_core_web_lg



# PII anonymization with Presidio and LlamaIndex
For this example I just used OAuth.

```
# This is formatted as code
```



In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine, OperatorConfig
from presidio_anonymizer.operators import Operator, OperatorType

from typing import Dict
from pprint import pprint

# Sample sentence in which the same person and location names occur more than
# once, so repeated entities can be observed in the results.
text = "Peter gave his book to Heidi which later gave it to Nicole. Peter lives in London and Nicole lives in Tashkent."
print("original text:")
pprint(text)
# Run the Presidio analyzer over the English text; the results are reused by
# the anonymizer further down in this cell.
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text, language="en")
print("analyzer results:")
pprint(analyzer_results)

class InstanceCounterAnonymizer(Operator):
    """
    Anonymizer which replaces the entity value
    with an instance counter per entity.

    Each distinct value of a given entity type is replaced by a placeholder
    built from ``REPLACING_FORMAT`` (e.g. ``<PERSON_0>``, ``<PERSON_1>``).
    Repeated occurrences of the same value receive the same placeholder.
    The value -> placeholder assignments are recorded in the caller-supplied
    ``entity_mapping`` dict, which is mutated in place.
    """

    REPLACING_FORMAT = "<{entity_type}_{index}>"

    def operate(self, text: str, params: Dict = None) -> str:
        """Anonymize the input text.

        :param text: The original entity value to replace.
        :param params: Must contain ``entity_type`` (str) and
            ``entity_mapping`` (dict of entity type -> {value: placeholder}).
        :return: The placeholder assigned to ``text``.
        """

        entity_type: str = params["entity_type"]

        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping: Dict[str, Dict[str, str]] = params["entity_mapping"]

        mapping_for_type = entity_mapping.get(entity_type)
        if mapping_for_type is None:
            # First time this entity type is seen: start a fresh sub-mapping.
            mapping_for_type = entity_mapping[entity_type] = {}
        elif text in mapping_for_type:
            # Value already anonymized once: reuse its placeholder.
            return mapping_for_type[text]

        # _get_last_index yields -1 for an empty mapping, so numbering
        # starts at 0.
        next_index = self._get_last_index(mapping_for_type) + 1
        new_text = self.REPLACING_FORMAT.format(
            entity_type=entity_type, index=next_index
        )

        mapping_for_type[text] = new_text
        return new_text

    @staticmethod
    def _get_last_index(entity_mapping_for_type: Dict) -> int:
        """Get the last index for a given entity type.

        :param entity_mapping_for_type: Mapping of value -> placeholder whose
            placeholders follow ``REPLACING_FORMAT``.
        :return: The highest index used so far, or -1 if the mapping is empty.
        """

        def get_index(value: str) -> int:
            # "<PERSON_12>" -> "12>" -> "12" -> 12
            return int(value.split("_")[-1][:-1])

        return max(
            (get_index(v) for v in entity_mapping_for_type.values()),
            default=-1,
        )

    def validate(self, params: Dict = None) -> None:
        """Validate operator parameters.

        :raises ValueError: If ``entity_mapping`` or ``entity_type`` is
            missing (including when ``params`` is ``None``).
        """

        # Treat a missing params dict like an empty one so the caller gets
        # the descriptive ValueError instead of a TypeError.
        params = params or {}

        if "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is referenced."""
        return "entity_counter"

    def operator_type(self) -> OperatorType:
        """Return the operator category (anonymization)."""
        return OperatorType.Anonymize

# Shared dict that the custom operator fills with
# entity type -> {original value -> placeholder}.
entity_mapping = {}

# Build the anonymizer engine and register the custom operator class on it.
anonymizer_engine = AnonymizerEngine()
anonymizer_engine.add_anonymizer(InstanceCounterAnonymizer)

# Apply the "entity_counter" operator to every detected entity type.
anonymized_result = anonymizer_engine.anonymize(
    text=text,
    analyzer_results=analyzer_results,
    operators={
        "DEFAULT": OperatorConfig(
            "entity_counter",
            {"entity_mapping": entity_mapping},
        ),
    },
)

# Show the anonymized text followed by the mapping that was collected.
print(anonymized_result.text)
pprint(entity_mapping, indent=2)




original text:
('Peter gave his book to Heidi which later gave it to Nicole. Peter lives in '
 'London and Nicole lives in Tashkent.')
configuration file /usr/local/lib/python3.10/dist-packages/conf/default.yaml not found.  Using default config: {'nlp_engine_name': 'spacy', 'models': [{'lang_code': 'en', 'model_name': 'en_core_web_lg'}]}.




configuration file is missing 'ner_model_configuration'. Using default




model_to_presidio_entity_mapping is missing from configuration, using default




low_score_entity_names is missing from configuration, using default




labels_to_ignore is missing from configuration, using default
analyzer results:
[type: PERSON, start: 0, end: 5, score: 0.85,
 type: PERSON, start: 23, end: 28, score: 0.85,
 type: PERSON, start: 52, end: 58, score: 0.85,
 type: PERSON, start: 60, end: 65, score: 0.85,
 type: LOCATION, start: 75, end: 81, score: 0.85,
 type: PERSON, start: 86, end: 92, score: 0.85,
 type: LOCATION, start: 102, end: 110, score: 0.85]
<PERSON_1> gave his book to <PERSON_2> which later gave it to <PERSON_0>. <PERSON_1> lives in <LOCATION_1> and <PERSON_0> lives in <LOCATION_0>.
{ 'LOCATION': {'London': '<LOCATION_1>', 'Tashkent': '<LOCATION_0>'},
  'PERSON': { 'Heidi': '<PERSON_2>',
              'Nicole': '<PERSON_0>',
              'Peter': '<PERSON_1>'}}


In [None]:
import logging
import sys
import re

# Configure root logging to write INFO-and-above records to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): this attaches a further stdout handler to the root logger; if
# basicConfig above installed its own handler, each record will presumably be
# printed twice — confirm that this is intended.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core.postprocessor import (
    PIINodePostprocessor,
    NERPIINodePostprocessor,
)
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.postprocessor.presidio import PresidioPIINodePostprocessor
from llama_index.core.schema import NodeWithScore

def extract_variables(presidio_new_nodes):
    """Extracts variables from Presidio output and groups them by sector.

    For every node, the ``__pii_node_info__`` metadata entry decides which
    sector(s) the node's text is filed under:

    * dict  -> the text is appended once per top-level key of the dict;
    * str   -> if the string starts with ``<...>``, the content between the
      angle brackets is used as the sector;
    * anything else (including a missing entry) -> the node is skipped.

    Args:
        presidio_new_nodes: Iterable of objects exposing ``.text`` and
            ``.node.metadata`` (a dict).

    Returns:
        Dict mapping each sector to the list of node texts filed under it.
    """
    sectors = {}

    for node in presidio_new_nodes:
        pii_type = node.node.metadata.get("__pii_node_info__")
        value = node.text

        if isinstance(pii_type, dict):
            # The top-level keys represent the sectors.
            for sector in pii_type:
                sectors.setdefault(sector, []).append(value)
        elif isinstance(pii_type, str):
            match = re.match(r"<(.+)>", pii_type)  # Extract sector from label
            if match:
                sectors.setdefault(match.group(1), []).append(value)
        # Nodes without usable PII info (e.g. metadata key absent) are
        # skipped rather than crashing the regex match on None.

    return sectors

# Sample therapist notes and session transcript containing names, dates and an
# organization; wrapped in a TextNode and passed to the Presidio postprocessor.
text = """
Therapist Notes:
Patient Name: Sarah Johnson
DOB: 05/12/1985
Session Date: 04/02/2024
Sarah discussed her ongoing struggles with anxiety and depression. She mentioned that her symptoms have worsened since losing her job at TechCorp Inc. last month. Sarah revealed that she has been having suicidal thoughts and has considered overdosing on her prescribed Xanax medication. She expressed feeling hopeless about her future and her ability to provide for her two children, Emily (age 8) and Jacob (age 5). Sarah also shared that her mother, Mary Johnson, was recently diagnosed with breast cancer, which has added to her stress and feelings of overwhelm.

Session Transcript:
Therapist: Good morning, Michael. How have you been feeling since our last session?
Michael: Not great, to be honest. I've been really struggling with my PTSD symptoms lately. The nightmares about my deployment in Afghanistan have been more frequent and intense. I keep reliving the IED explosion that killed my best friend, Chris Thompson. It happened on August 15, 2019, and I can't seem to shake the guilt and the memory of seeing his body torn apart. I've been self-medicating with alcohol more often, usually drinking a fifth of vodka each night just to fall asleep. My wife, Jessica, is really worried about me, and I'm scared that my drinking is going to ruin our marriage. I don't know how much longer I can keep going like this.
"""
# Wrap the raw text in a node and run it through the Presidio postprocessor.
presidio_node = TextNode(text=text)
processor = PresidioPIINodePostprocessor()
presidio_new_nodes = processor.postprocess_nodes([NodeWithScore(node=presidio_node)])

# The postprocessor stores its PII info in the first node's metadata;
# keep it under both names and display it.
results = pii_data = presidio_new_nodes[0].node.metadata["__pii_node_info__"]
print(pii_data)



In [None]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig, EngineResult, RecognizerResult
from faker import Faker


fake = Faker()


def fake_name(x):
    """Return a newly generated fake name; the received value is ignored."""
    return fake.name()


# Input sentence and the (hand-written) analyzer output marking "Raphael".
text_to_anonymize = "My name is Raphael and I like to fish."
analyzer_results = [
    RecognizerResult(entity_type="PERSON", start=11, end=18, score=0.8),
]

# PERSON entities are replaced via the "custom" operator calling fake_name.
operators = {
    "PERSON": OperatorConfig("custom", {"lambda": fake_name}),
}

anonymizer = AnonymizerEngine()
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)

print(anonymized_results.text)

ModuleNotFoundError: No module named 'faker'