In [None]:
import os
import pprint
import re
from itertools import chain
from typing import Dict, List, Optional, Tuple

import dotenv
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from openai import AzureOpenAI

In [None]:
import spacy
from spacy.tokens import Doc, Span
from spacy.language import Language

In [None]:
from annotated_text import annotated_text
from presidio_analyzer import (
    AnalyzerEngine,
    RecognizerResult,
    RecognizerRegistry,
    PatternRecognizer,
    Pattern,
)
from presidio_analyzer.nlp_engine import NlpEngine
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import (
    NlpEngine,
    NlpEngineProvider,
)
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

In [None]:
# Service configuration. Credentials are read from the environment so that no
# secret is stored in the notebook; export the variables before starting the
# kernel (or load a local .env file into the environment first).
# NOTE(review): the API keys that used to be hardcoded in this cell must be
# treated as leaked and rotated.
OPENAI_TYPE = os.environ.get("OPENAI_TYPE", "Azure")  # or "openai"
OPENAI_KEY = os.environ.get("OPENAI_KEY")  # secret: no default on purpose
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION", "2023-09-15-preview")
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://aai-applications-openai-services.openai.azure.com/",
)
AZURE_OPENAI_DEPLOYMENT = os.environ.get(
    "AZURE_OPENAI_DEPLOYMENT", "AAI-Applications-Completion"
)
GOOGLE_CUSTOM_SEARCH_ID = os.environ.get("GOOGLE_CUSTOM_SEARCH_ID", "7706819d237454897")
GOOGLE_CUSTOM_SEARCH_KEY = os.environ.get("GOOGLE_CUSTOM_SEARCH_KEY")  # secret: no default

In [None]:
# Azure OpenAI client shared by the rest of the notebook.
client = AzureOpenAI(
    api_key=OPENAI_KEY,
    api_version=OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

In [None]:
# Candidate NER back-ends, each written as "<package>/<model path>".
model_list = [
    "spaCy/en_core_web_lg",
    "flair/ner-english-large",
    "HuggingFace/obi/deid_roberta_i2b2",
    "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
    "stanza/en",
]

In [None]:
# Run parameters consumed by QueryExecutor and the extraction loop.
args = {
    "google_engine_id": GOOGLE_CUSTOM_SEARCH_ID,
    "openai_client": client,
    "custom_search_key": GOOGLE_CUSTOM_SEARCH_KEY,
    "q": "Alison Goldfrapp",  # seed query
    "k": 50,  # number of distinct entities to collect before stopping
    "model": "spaCy/en_core_web_lg",  # "<package>/<model path>"
    "https_flag": True,
    "remove_entities": [
        "DATE_TIME",
        "AU_ACN",
        "AU_ABN",
        "AU_MEDICARE",
        "AU_TFN",
        "IN_AADHAAR",
        "IN_VEHICLE_REGISTRATION",
        "IN_PAN",
        "SG_NRIC_FIN",
    ],
}

In [None]:
class QueryExecutor:
    """Runs Google Custom Search queries, fetches the result pages and tracks
    which URLs and queries have already been used."""

    def __init__(self, args) -> None:
        """
        Initialize a QueryExecutor object from the ``args`` dictionary.

        Keys read from ``args``: "q", "k", "custom_search_key",
        "google_engine_id", "openai_client", "https_flag", "model".

        Instance Variables:
            q: the current query string
            k: the number of distinct entities requested before halting
            custom_search_key: the Google Custom Search API key
            google_engine_id: the Google Custom Search Engine ID
            client: OpenAI client
            https_flag: passed as ``verify`` when fetching pages; also selects
                the search transport in getQueryResult
            engine: the Google Custom Search service object
            seen_urls: the set of URLs that we have already seen
            used_queries: the set of query strings that we have already used
            spacy_model_package / spacy_model: the part of args["model"] before
                the first "/" and the remainder after it
            model: the Presidio wrapper (analyzer + registry)
            extractor: the presidioExtractor object
        """

        self.q = args["q"]
        self.k = args["k"]
        self.custom_search_key = args["custom_search_key"]
        self.google_engine_id = args["google_engine_id"]
        self.client = args["openai_client"]
        self.https_flag = args["https_flag"]
        self.engine = build("customsearch", "v1", developerKey=args["custom_search_key"])
        self.seen_urls = set()
        self.used_queries = set([self.q])
        # "HuggingFace/obi/deid_roberta_i2b2" -> ("HuggingFace", "obi/deid_roberta_i2b2")
        model_package, _, model_name = args["model"].partition("/")
        self.spacy_model_package = model_package
        self.spacy_model = model_name
        self.model = Presidio(self.spacy_model_package, self.spacy_model)
        self.extractor = presidioExtractor()

    def getQueryResult(self, query: str, k, https_flag: bool) -> List:
        """
        Get at most ``k`` results for a given query from Google Custom Search API
        Source: https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py

        Parameters:
            query (str) - the search query
            k (int) - maximum number of result items to return
            https_flag (bool) - True: use the google-api client; False: call the
                REST endpoint through requests with certificate checks disabled
        Returns:
            List - up to k result items; empty when the response has no "items"
        """
        if https_flag:
            full_res = (
                self.engine.cse()
                .list(
                    q=query,
                    cx=self.google_engine_id,
                )
                .execute()
            )
        else:
            # Let requests build the query string so the query is URL-encoded.
            full_res = requests.get(
                "https://www.googleapis.com/customsearch/v1",
                params={
                    "key": self.custom_search_key,
                    "cx": self.google_engine_id,
                    "q": query,
                },
                verify=False,
            ).json()

        # A response without hits carries no "items" key; exactly k items are returned.
        return full_res.get("items", [])[:k]

    def processText(self, url: str, max_chars: int = 10000) -> Optional[str]:
        """
        Fetch a URL and return its cleaned paragraph text.
        If webpage retrieval fails (timeout, connection error, ...), it is
        skipped (None returned).

        Extracts the plain text from the URL using Beautiful Soup.
        Only the text in the <p> tags is processed; paragraphs are joined with
        a space so words of adjacent paragraphs are not glued together.
        If the resulting plain text is longer than ``max_chars`` characters, it
        is truncated. Tabs, newlines and repeated spaces are collapsed to one
        space and zero-width spaces are removed.

        Parameters:
            url (str) - the URL to process
            max_chars (int) - maximum number of characters kept (default 10000)
        Returns:
            Optional[str] - the cleaned text, or None when nothing was extracted
        """

        try:
            print("        Fetching text from url ...")
            page = requests.get(url, timeout=5, verify=self.https_flag)
        except requests.exceptions.Timeout:
            print(f"Error processing {url}: The request timed out. Moving on...")
            return None
        except requests.exceptions.RequestException as e:
            # Connection, SSL or invalid-URL errors must not abort the whole run.
            print(f"Error processing {url}: {e}. Moving on ...")
            return None
        try:
            soup = BeautifulSoup(page.content, "html.parser")
            text = " ".join(block.get_text() for block in soup.find_all("p"))

            if not text.strip():
                return None

            text_len = len(text)
            if text_len > max_chars:
                print(
                    f"        Trimming webpage content from {text_len} to {max_chars} characters"
                )
            preprocessed_text = text[:max_chars]
            print(
                f"        Webpage length (num characters): {len(preprocessed_text)}"
            )
            # Removing redundant newlines and some whitespace characters.
            preprocessed_text = re.sub("\t+", " ", preprocessed_text)
            preprocessed_text = re.sub("\n+", " ", preprocessed_text)
            preprocessed_text = re.sub(" +", " ", preprocessed_text)
            preprocessed_text = preprocessed_text.replace("\u200b", "")

            return preprocessed_text
        except Exception as e:
            print(f"Error processing {url}: {e}. Moving on ...")
            return None

    def parseResult(self, result: Dict[str, str]) -> Optional[str]:
        """
        Parse the result of a query.
        Exposed function for use by main function.
        Parameters:
            result (dict) - one item as returned as the result of a query
        Returns:
            The page text, or None when the URL was already seen or no text
            could be extracted.
        """
        url = result["link"]
        if url in self.seen_urls:
            # Already processed: nothing new to return.
            return None
        self.seen_urls.add(url)
        return self.processText(url) or None

    def checkContinue(self) -> bool:
        """
        Evaluate whether fewer than k distinct entities have been collected.
        Parameters: None
        Returns: bool (True if we need to find more entities, else False)
        """

        lst = list(chain.from_iterable(self.extractor.related_entities))
        ent_count = len(set(lst))
        print(f"entity count: {ent_count}")
        return ent_count < self.k

    def getNewQuery(self) -> Optional[str]:
        """
        Creates a new query.
        Select from the extracted tuples a tuple y that has not been used for
        querying yet and create a query q from it by joining its values with
        spaces. The query string is recorded in used_queries and stored in q.
        If no such tuple exists, return None (the search has "stalled").

        Parameters:
            None
        Returns:
            query (str) if available; else None
        """
        # Iterating through extracted tuples
        for relation in list(self.extractor.related_entities):
            tmp_query = " ".join(relation)
            # Checking if query has been used
            if tmp_query not in self.used_queries:
                # Record the query string (not the tuple) so the membership
                # test above recognises it next time.
                self.used_queries.add(tmp_query)
                # Setting new query
                self.q = tmp_query
                return self.q
        # Every tuple has been tried: no valid query found
        return None


In [None]:
"Presidio  class"
import json
import re
from typing import List, Set, Tuple

import openai
import spacy


class Presidio:
    """
    Presidio class: bundles the NLP engine, recognizer registry and analyzer
    built for one NER model.
    """
    def __init__(self, spacy_model_package, spacy_model):
        """
        Initialize a Presidio object
        Parameters:
            spacy_model_package: the package part of the model name (stored only)
            spacy_model: the model name passed to the NLP engine
        """
        self.model_package = spacy_model_package
        self.model = spacy_model
        self.ta_key = ""
        self.ta_endpoint = ""
        self.nlp_engine, self.registry = self.nlp_engine_and_registry(self.model)
        self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine, registry=self.registry)
        self.entities = self.analyzer.get_supported_entities()

    def nlp_engine_and_registry(
        self,
        model_path: str,
        ) -> Tuple[NlpEngine, RecognizerRegistry]:
        """Create the NLP Engine instance based on the requested model.
        :param model_path: Which model to use for NER, e.g. "en_core_web_lg"
        :return: the NLP engine and a registry loaded with the predefined
            recognizers

        NOTE(review): the engine name is always "spacy"; self.model_package is
        not consulted here, so non-spaCy entries of model_list are presumably
        not supported by this configuration - confirm.
        """
        nlp_configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": model_path}],
            "ner_model_configuration": {
              "model_to_presidio_entity_mapping": {
                "PER": "PERSON",
                "PERSON": "PERSON",
                "NORP": "NRP",
                "FAC": "FACILITY",
                "LOC": "LOCATION",
                "GPE": "LOCATION",
                "LOCATION": "LOCATION",
                "ORG": "ORGANIZATION",
                "ORGANIZATION": "ORGANIZATION",
                "DATE": "DATE_TIME",
                "TIME": "DATE_TIME",
               },
               "low_confidence_score_multiplier": 0.4,
               "low_score_entity_names": ["ORG", "ORGANIZATION"],
             },
        }
        nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
        registry = RecognizerRegistry()
        registry.load_predefined_recognizers(nlp_engine=nlp_engine)

        return nlp_engine, registry

    def annotate(self, text: str, analyze_results: List[RecognizerResult]):
        """Highlight the identified PII entities on the original text
        :param text: Full text
        :param analyze_results: list of results from presidio analyzer engine
        :return: list alternating plain strings and (entity text, entity type)
            tuples; ``[text]`` when no entity was found
        """
        # Use the anonymizer with an identity "custom" operator to resolve
        # overlaps. This class has no extractor attribute, so the anonymizer
        # engine is called directly.
        anonymized = AnonymizerEngine().anonymize(
            text,
            analyze_results,
            operators={"DEFAULT": OperatorConfig("custom", {"lambda": lambda x: x})},
        )

        # sort by start index
        results = sorted(anonymized.items, key=lambda x: x.start)
        if not results:
            # Nothing to highlight: keep the text instead of returning nothing.
            return [text]

        # text before the first entity
        tokens = [text[: results[0].start]]
        for i, res in enumerate(results):
            # append entity text and entity type
            tokens.append((text[res.start : res.end], res.entity_type))
            # text up to the next entity, or all remaining text after the last one
            next_start = results[i + 1].start if i + 1 < len(results) else len(text)
            tokens.append(text[res.end : next_start])
        return tokens

    @staticmethod
    def create_ad_hoc_deny_list_recognizer(
        deny_list: Optional[List[str]] = None,
    ) -> Optional[PatternRecognizer]:
        """Build a GENERIC_PII recognizer from a deny list.
        :param deny_list: words to flag; None or empty yields no recognizer
        :return: the recognizer, or None when deny_list is empty
        """
        if not deny_list:
            return None

        deny_list_recognizer = PatternRecognizer(
            supported_entity="GENERIC_PII", deny_list=deny_list
        )
        return deny_list_recognizer

    @staticmethod
    def create_ad_hoc_regex_recognizer(
        regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
    ) -> Optional[PatternRecognizer]:
        """Build a recognizer for a single regular expression.
        :param regex: pattern to match; empty/None yields no recognizer
        :param entity_type: entity name reported for matches
        :param score: score assigned to the pattern
        :param context: optional context words passed to the recognizer
        :return: the recognizer, or None when regex is empty
        """
        if not regex:
            return None
        pattern = Pattern(name="Regex pattern", regex=regex, score=score)
        regex_recognizer = PatternRecognizer(
            supported_entity=entity_type, patterns=[pattern], context=context
        )
        return regex_recognizer

In [None]:
"Presidio Extractor class"
import json
import re
from typing import List, Set, Tuple

import openai
import spacy


class presidioExtractor:
    """
    Presidio Extractor class: turns analyzer results into tables, collects the
    entity texts seen per page and renders anonymized / annotated text.
    """

    def __init__(self, model="en_core_web_lg"):
        """
        Initialize a presidioExtractor object
        Parameters:
            model: the spaCy model to load
        """
        self.nlp = spacy.load(model)
        # One tuple of distinct entity texts per analysed page.
        self.related_entities = set()

    def get_related_entities(self, text: str, analyze_results: List[RecognizerResult]) ->  pd.DataFrame:
        """
        Exposed function to tabulate the analyzer results for one text and
        record the distinct entity texts in ``self.related_entities``.
        Parameters:
            text: the text the results refer to
            analyze_results: results from the presidio analyzer engine
        Returns:
            DataFrame with columns entity_type, text, start, end, confidence,
            Text, followed by the analysis-explanation columns when present.
            An empty frame with the base columns is returned (and nothing is
            recorded) when there are no results.
        """
        base_columns = ["entity_type", "text", "start", "end", "confidence", "Text"]
        if not analyze_results:
            # No entity on this page: selecting columns from an empty frame
            # would raise KeyError, so return an empty table instead.
            return pd.DataFrame(columns=base_columns)

        spans = [text[res.start : res.end] for res in analyze_results]
        df = pd.DataFrame.from_records([r.to_dict() for r in analyze_results])
        df["text"] = spans

        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
            columns={"score": "confidence"}
        )
        # "Text" duplicates "text"; kept so the returned columns stay unchanged.
        df_subset["Text"] = spans

        # The explanation is absent unless the analyzer was asked to return
        # the decision process; rows without one contribute empty cells.
        explanations = [
            r.analysis_explanation.to_dict()
            if getattr(r, "analysis_explanation", None) is not None
            else {}
            for r in analyze_results
        ]
        if any(explanations):
            analysis_explanation_df = pd.DataFrame.from_records(explanations)
            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)

        # Distinct entity texts of this page, in sorted order.
        self.related_entities.add(tuple(sorted(set(spans))))

        return df_subset

    def anonymize(
        self,
        text: str,
        operator: str,
        analyze_results: List[RecognizerResult],
        mask_char: Optional[str] = None,
        number_of_chars: Optional[int] = None,
        encrypt_key: Optional[str] = None,
    ):
        """Anonymize identified input using Presidio Anonymizer.
        :param text: Full text
        :param operator: Operator name ("highlight" is run as "custom" with an
            identity function, "synthesize" is run as "replace")
        :param mask_char: Mask char (for mask operator)
        :param number_of_chars: Number of characters to mask (for mask operator)
        :param encrypt_key: Encryption key (for encrypt operator)
        :param analyze_results: list of results from presidio analyzer engine
        :return: the result object of AnonymizerEngine.anonymize
        """

        # Define operator config
        if operator == "mask":
            operator_config = {
                "type": "mask",
                "masking_char": mask_char,
                "chars_to_mask": number_of_chars,
                "from_end": False,
            }
        elif operator == "encrypt":
            operator_config = {"key": encrypt_key}
        elif operator == "highlight":
            operator_config = {"lambda": lambda x: x}
        else:
            operator_config = None

        # Map notebook-level operator names onto anonymizer operator names
        operator = {"highlight": "custom", "synthesize": "replace"}.get(operator, operator)

        res = AnonymizerEngine().anonymize(
            text,
            analyze_results,
            operators={"DEFAULT": OperatorConfig(operator, operator_config)},
        )
        return res

    def annotate(self, text: str, analyze_results: List[RecognizerResult]):
        """Highlight the identified PII entities on the original text
           :param text: Full text
           :param analyze_results: list of results from presidio analyzer engine
           :return: list alternating plain strings and (entity text, entity
               type) tuples; ``[text]`` when no entity was found
         """
        # Use the anonymizer to resolve overlaps
        anonymized = self.anonymize(
            text=text,
            operator="highlight",
            analyze_results=analyze_results,
        )

        # sort by start index
        results = sorted(anonymized.items, key=lambda x: x.start)
        if not results:
            # Nothing to highlight: keep the text instead of returning nothing.
            return [text]

        # text before the first entity
        tokens = [text[: results[0].start]]
        for i, res in enumerate(results):
            # append entity text and entity type
            tokens.append((text[res.start : res.end], res.entity_type))
            # text up to the next entity, or all remaining text after the last one
            next_start = results[i + 1].start if i + 1 < len(results) else len(text)
            tokens.append(text[res.end : next_start])
        return tokens

In [None]:
# Build the executor from the run parameters; its constructor creates the
# Custom Search service, the Presidio wrapper and the extractor.
executor = QueryExecutor(args)

In [None]:
# Iterative extraction: search, analyse every new result page, and re-query
# with extracted entities until enough distinct entities were collected or no
# unused query is left.
iterate_further = True
iterations = 0  # number of queries executed

res = []  # one entity DataFrame per analysed page
txt = []  # cleaned text of every analysed page

# Entity types listed under args["remove_entities"] are excluded from analysis.
excluded_entities = set(args.get("remove_entities", []))
analysis_entities = [
    entity for entity in executor.model.entities if entity not in excluded_entities
]

while iterate_further:
    # Get the top 10 results for the current query
    results = executor.getQueryResult(executor.q, 10, executor.https_flag)
    print(f"=========== Iteration: {iterations} - Query: {executor.q} ===========")
    for i, item in enumerate(results):
        url = item["link"]
        print(f"URL ( {i+1} / {len(results)}): {url}")
        if url in executor.seen_urls:
            # Already fetched for an earlier query; analysing it again adds nothing.
            continue
        executor.seen_urls.add(url)
        text = executor.processText(url)
        if text:
            txt.append(text)
            analyze_results = executor.model.analyzer.analyze(
                text=text,
                entities=analysis_entities,
                language="en",
                return_decision_process=True,
            )
            df = executor.extractor.get_related_entities(
                text=text, analyze_results=analyze_results
            )
            res.append(df)
        if not executor.checkContinue():
            iterate_further = False
            break
    # One iteration corresponds to one executed query, not one URL.
    iterations += 1
    # If a new iteration is needed, get the new query
    if iterate_further and not executor.getNewQuery():
        print("No new queries to try")
        print("Exiting ...")
        break

print(f"Total # of iterations = {iterations}")

In [None]:
# Stack the per-page entity tables into one frame (row labels repeat per page).
resdf = pd.concat(res)

In [None]:
# Single context string made of every analysed page, separated by spaces.
plain_text = ' '.join(txt)

In [None]:
#annotated_tokens = executor.presidio.extractor.annotate(text=text, analyze_results=analyze_results)
#annotated_text(*annotated_tokens)

In [None]:
def clean_entity_names(x):
    """Store a normalised entity name under ``x['clean_text']``.

    The value is the part of ``x['text']`` before the first '.', lower-cased.
    ``x`` is modified in place and returned.
    """
    head, _, _ = x['text'].partition('.')
    x['clean_text'] = head.lower()
    return x

In [None]:
# Add the normalised `clean_text` column row by row via clean_entity_names.
# Note: this rebinds resdf, so the cell output depends on it having run once.
resdf = resdf.apply(clean_entity_names,axis=1)

In [None]:
# Display the cleaned entity table.
resdf

In [None]:
# Distinct normalised entity names: keep the part before the first '.' of
# every clean_text value, then de-duplicate through a set.
ent_list = [name.split('.')[0] for name in list(resdf["clean_text"])]
ent_set = set(ent_list)
un_ent_lst = list(ent_set)

In [None]:
# Display the list of distinct entity names.
un_ent_lst

### Entity Graph

In [None]:
def knowledge_graph_v1(ent_list: list, input: str, model: str = "AAI-Applications"):
    """Ask the chat model for a relation graph over the given entities.

    Sends a system prompt describing the extraction task plus a user message
    holding ``ent_list`` and ``input``, through the notebook-level ``client``,
    requesting a JSON-object response.

    Parameters:
        ent_list: entity names the extracted relations should relate to
        input: the context text to analyse
        model: model/deployment name passed to the chat completion call
    Returns:
        The content of the first choice's message. The prompt asks for
        ``{"graph": [{"node_1": ..., "node_2": ..., "edge": ...}, ...]}``.
    """
    # Each fragment ends with a space or newline so that sentences are not
    # fused together when the literals are concatenated.
    SYS_PROMPT = ("You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a list of entities and some text. "
        "Your task is to extract the ontology of terms mentioned in the given context relating to the entities in the list.\n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include person (agent), location, organization, date, duration, \n"
            "\tcondition, concept, object, entity  etc.\n"
            "\tTerms should be as atomistic as possible\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n"
        "Return this list as valid JSON like the following: \n"
        "{ \n"
        # The key is quoted so the example itself is valid JSON.
        '  "graph": [ \n'
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        " ] \n"
        "}"
    )

    USER_PROMPT = f"entity_list: ```{ent_list}```, context: ```{input}``` \n\n output: "

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]

    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )

    return response.choices[0].message.content

In [None]:
import pandas as pd
import networkx as nx
from pyvis.network import Network


def create_graph(gdf: pd.DataFrame):
    """Build an undirected graph from an edge table.

    Parameters:
        gdf: frame with columns "node_1", "node_2" and "edge"
    Returns:
        A networkx Graph with one edge per row; node names are the lower-cased
        string form of the node values and the edge text is stored under the
        ``title`` attribute.
    """
    G = nx.Graph()
    ## Add edges to the graph
    for node_1, node_2, relation in zip(gdf["node_1"], gdf["node_2"], gdf["edge"]):
        # Convert to str before lower-casing so non-string node values
        # (e.g. numbers returned by the model) do not raise AttributeError.
        G.add_edge(
            str(node_1).lower(),
            str(node_2).lower(),
            title=relation,
        )

    return G


def create_vis(g, output_path: str = "knowledge_graph.html"):
    """Render a networkx graph as an interactive pyvis network.

    Parameters:
        g: the networkx graph to draw
        output_path: HTML file the visualisation is written to
    Returns:
        Whatever ``Network.show`` returns, so that a notebook cell ending with
        this call can display it (presumably an IFrame in notebook mode -
        confirm against the installed pyvis version).
    """
    net = Network(
        notebook=True,
        bgcolor="#FFFFFF",  # white; the previous 5-digit value was not a valid hex colour
        cdn_resources="remote",
        height="800px",
        width="100%",
        select_menu=True,
        font_color="#cccccc",
        filter_menu=False,
    )

    net.from_nx(g)
    # Alternative layouts that were tried:
    # net.repulsion(node_distance=150, spring_length=400)
    # net.force_atlas_2based(central_gravity=0.015, gravity=-31)
    net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)

    net.show_buttons(filter_=['physics'])
    return net.show(output_path)

In [None]:
# Ask the model for relations between the distinct entities, using the
# concatenated page text as context.
kg = knowledge_graph_v1(un_ent_lst, plain_text)

In [None]:
# Parse the model's JSON reply into Python objects.
j = json.loads(kg) 

In [None]:
# Edge table built from the "graph" entry of the parsed reply.
gdf = pd.DataFrame(j['graph'])

In [None]:
# Turn the edge table into a networkx graph.
g = create_graph(gdf)

In [None]:
# Render the entity graph as an interactive pyvis network and write it to
# knowledge_graph.html.
net = Network(
    notebook=True,
    bgcolor="#FFFFFF",  # white; the previous 5-digit value was not a valid hex colour
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    font_color="#6a6b6f",
    filter_menu=True,
)

net.from_nx(g)
# Alternative layouts that were tried:
# net.repulsion(node_distance=150, spring_length=400)
# net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)

net.show_buttons(filter_=['physics'])
net.show("knowledge_graph.html")

In [None]:
# Export the graph to a GEXF file.
nx.write_gexf(g, 'soi-entity-graph.gexf')