In [1]:
import logging
import warnings

# Silence every Python warning for the rest of the session. Both calls are
# issued, exactly as before, so the resulting filter list is unchanged.
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# The text-splitter logger is the one that emits the noisy message; only
# records at ERROR level or above are allowed through.
logger = logging.getLogger('langchain_text_splitters.base')
logger.setLevel(logging.ERROR)

### Install necessary packages

In [2]:
#install necessray packages
!pip install -q -U torch tensorflow transformers langchain  faiss-cpu sentence_transformers
!pip install -q peft==0.4.0 trl==0.4.7 accelerate==0.21.0 bitsandbytes==0.41.3
!pip install pypdf PyPDF2



### Import packages

In [3]:
#import packages
import os
import torch
from transformers import (
    BitsAndBytesConfig,
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
)

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import AsyncChromiumLoader
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from PyPDF2 import PdfReader
import json

### 1.	Develop a Python script to accept NER responses as input.

In [4]:
# This cell reads an NER response from a JSON file and accepts it as input.

def get_NER(file_name):
    """Collect NER entity texts from a JSON response file.

    The file is parsed as JSON and ``data[0]`` is walked by position: the
    n-th entry is attributed to the n-th name in ``types``. For every entry
    the texts found under ``matches -> <kind> -> entity_page_mapping`` are
    gathered for the kinds ``false_positives``, ``false_negative`` and
    ``true_positive`` (in that order).

    NOTE(review): the mapping between entries and entity types is purely
    positional — confirm the JSON always lists the entries in this order.
    The key spellings (plural ``false_positives``, singular for the other
    two, ``ADRESS``) are kept exactly as they were; verify against the schema.

    Args:
        file_name: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A tuple ``(NER, false_positives)`` of dicts keyed by entity type.
        ``NER`` holds every collected text; ``false_positives`` holds only
        the texts found under the ``false_positives`` kind.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    types = ['ADRESS', 'DATE', 'EMAIL', 'MANUAL_MARKED', 'NAME',
             'ORGANIZATION', 'METRICS']

    # One independent empty list per entity type in each result dict.
    false_positives = {entity_type: [] for entity_type in types}
    NER = {entity_type: [] for entity_type in types}

    # The context manager closes the file even if parsing fails.
    with open(file_name, encoding="utf-8") as f:
        data = json.load(f)

    # (key under 'matches', whether the texts are also recorded as false positives)
    match_kinds = (
        ('false_positives', True),
        ('false_negative', False),
        ('true_positive', False),
    )

    for n in range(len(data[0])):
        for match_key, is_false_positive in match_kinds:
            # Best effort: a missing key, an entry beyond the known types or a
            # malformed mapping skips the rest of this kind only. Texts that
            # were already collected before the problem are kept.
            try:
                mapping = data[0][n]['matches'][match_key]['entity_page_mapping']
                entity_type = types[n]
                for entry in mapping:
                    text = entry['text']
                    if is_false_positive:
                        false_positives[entity_type].append(text)
                    NER[entity_type].append(text)
            except (KeyError, IndexError, TypeError):
                continue

    return NER, false_positives

# Load the first NER response; get_NER returns the collected entities and the
# subset recorded as false positives, both keyed by entity type.
NER, false_positives = get_NER('ner1.json')

The LLM analyzes the context of the NER entities.

### 2.	Integrate a Language Model (LLM) into the program to analyze the context of entities.

In [5]:
# The tokenizer and the LLM are both loaded from the pretrained
# Mistral 7B Instruct model; the LLM is loaded with 4-bit quantization.
#
# SECURITY: the Hugging Face access token is read from the HF_TOKEN environment
# variable instead of being written into the notebook. When the variable is
# unset, None is passed. The token that was previously hard-coded in this cell
# has been exposed and should be revoked.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
hf_token = os.environ.get("HF_TOKEN")

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,
                                          trust_remote_code=True,
                                          use_auth_token=hf_token)
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# define quantization config
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # fp4 or nf4
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load pre-trained model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    use_auth_token=hf_token,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Text-generation pipeline that the LLM uses for the NER analysis.
# `return_full_text=True` makes the generated text include the prompt.
# NOTE(review): `temperature` normally only has an effect when sampling is
# enabled (`do_sample=True`) — confirm whether sampling was intended here.
text_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=290,
    temperature=0.2,
    repetition_penalty=1.12,
)

In [7]:
# Define the Hugging Face pipeline wrapper.
# The Mistral model is run through the HuggingFacePipeline class.
mistral_llm = HuggingFacePipeline(pipeline=text_pipeline)

In [8]:
# This cell defines the prompt, the chain and the helper used to analyse NER responses.

# define prompt template
prompt_template = """
### [INST] Instruction: Analyse NER data. NER data are given in the input data.
 NER data is dictionary

###
INPUT:
{input} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["input"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

# Build the chain: the value passed to invoke() is forwarded unchanged as the
# prompt's `input` variable. (No retrieval step is involved.)
rag_chain = (
 {"input": RunnablePassthrough()}
    | llm_chain
)

# `rag_chain` is rebound by later cells. Pin this cell's chain under its own
# name so analyze_NER keeps using the analysis prompt regardless of which
# cells have been executed since.
ner_analysis_chain = rag_chain


def analyze_NER(input):
    """Run the NER-analysis prompt on ``input`` and print the model's answer.

    Args:
        input: The NER data to analyse; it is inserted into the prompt's
            ``{input}`` placeholder.

    Returns:
        None. The answer is printed after an ``Answer:`` header.
    """
    result = ner_analysis_chain.invoke(input)
    answer = result['text']
    print('Answer:')
    # Print only what follows the last '[/INST]' marker of the returned text.
    print(answer.split('[/INST]')[-1])


  warn_deprecated(


In [9]:
# Ask the LLM to analyse the NER data loaded from ner1.json; the answer is printed.
analyze_NER(NER)

Answer:

  Based on the given NER (Named Entity Recognition) data, we have the following key-value pairs:

  1. 'ADRESS': []
     This key represents a location or an address. The current value is an empty list.

  2. 'DATE': ['1 July 2009', '1 March 2020', '24-02-2020']
      This key represents dates. The current value is a list containing three date strings.

  3. 'EMAIL': []
       This key represents email addresses. The current value is an empty list.

  4. 'MANUAL_MARKED': []
        This key may represent manually marked entities. However, the current value is an empty list and it's not clear what these entities might be without additional context.

  5. 'NAME': ['Sergey Balk']
         This key represents names of people or organizations. The current value is a list containing one name string: "Sergey Balk".

  6. 'ORGANIZATION': ['Assistance', 'Chi utive Officer Globality S.A', 'Czech Republic Company', 'Data Controller', 'Data Processor', 'Ergo', 'Ergo Group', 'Ergo Group AG

### 3.	Implement a filtering mechanism to identify and remove false positives from the NER responses.

In [10]:
# This cell defines the prompt, the chain and the helper used to identify and
# remove false positives from the NER response.
prompt_template = """
### [INST] Instruction:  please identify and remove false positives from the NER data.
 NER data is dictionary
 Please output the filtered NER data and false positive data

###
INPUT:
{input} [/INST]
"""
# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["input"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

# Build the chain: the value passed to invoke() is forwarded unchanged as the
# prompt's `input` variable. (No retrieval step is involved.)
rag_chain = (
 {"input": RunnablePassthrough()}
    | llm_chain
)

# `rag_chain` is a name shared with other cells. Pin this cell's chain under
# its own name so remove_false_positive_NER keeps using this prompt regardless
# of which cells have been executed since.
false_positive_chain = rag_chain


def remove_false_positive_NER(input):
    """Run the false-positive filtering prompt on ``input`` and print the answer.

    Args:
        input: The NER data to filter; it is inserted into the prompt's
            ``{input}`` placeholder.

    Returns:
        None. The answer is printed after an ``Answer:`` header.
    """
    result = false_positive_chain.invoke(input)
    answer = result['text']
    print('Answer:')
    # Print only what follows the last '[/INST]' marker of the returned text.
    print(answer.split('[/INST]')[-1])

In [11]:
# Load the NER response from ner2.json and ask the LLM to filter it.
# The second return value is bound to a named throwaway rather than `_`,
# because assigning to `_` shadows IPython's last-output variable.
NER, _unused_false_positives = get_NER('ner2.json')
remove_false_positive_NER(NER)

Answer:

  OUTPUT:
  {'ADRESS': ['Europe South New Bangkok, Beijing, Sydney, Africa'], 'DATE': ['1 January 2012', '11 September 2009'], 'EMAIL': [], 'MANUAL_MARKED': [], 'NAME': ['Steichen', 'Velde'], 'ORGANIZATION': ['Assurances', 'Azur', 'Azur Euro Center', 'Center', 'Commissariat aux Assurances', 'Complete Bank', 'Coo fpp-cind', 'Dkv Globality', 'Dkv Globality S.A', 'Dkv Globality S.A. ("Dkv Globality"', "Dkv Globality's", 'Euro', 'Euro Center', 'Euro Center Holding', 'Euro Center Holding A/S Frederiksberg Allé', 'Euro Center Holding a/S ("Euro Center"', "Euro Center's", 'Euro Centers', 'Euro-Center', 'Euro-Center Holding', 'Glbality', 'Munich Re Group', 'notifying Party'], 'METRICS': []}
  
 False Positives: ['Azur Euro Center Holding', 'Euro Center Holding A/S ("Euro Center"', "Euro Center'


The LLM identifies the false positives and returns the filtered NER data. Note that the output above is cut off mid-list, most likely by the `max_new_tokens` limit.

### 4.	Optimize the program for efficiency and scalability, considering large volumes of NER data.

In [12]:
# Second text-generation pipeline for the NER analysis, with adjusted settings:
# - temperature=0.18: controls the randomness of the generated text; lower
#   values give more deterministic output, higher values more randomness.
#   NOTE(review): `temperature` normally only has an effect when sampling is
#   enabled (`do_sample=True`) — confirm whether sampling was intended here.
# - repetition_penalty=1.2: a higher penalty discourages the model from
#   repeating the same words or phrases.
# - max_new_tokens=310: upper bound on the number of newly generated tokens,
#   which limits the length of the output.
text_pipeline2 = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=310,
    temperature=0.18,
    repetition_penalty=1.2,
)

# Wrap the pipeline for LangChain: the Mistral model is run through the
# HuggingFacePipeline class.
mistral_llm2 = HuggingFacePipeline(pipeline=text_pipeline2)

In [13]:
# This cell defines the prompt, the chain and the helper used to identify and
# remove false positives with the second pipeline (mistral_llm2).

# define prompt template
prompt_template = """
### [INST] Instruction:  please identify and remove false positives from the NER data.
 NER data is dictionary
 Please output the filtered NER
 please output false positive data in unrepeat format


###
INPUT:
{input} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["input"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm2, prompt=prompt)

# Build the chain: the value passed to invoke() is forwarded unchanged as the
# prompt's `input` variable. (No retrieval step is involved.)
rag_chain = (
 {"input": RunnablePassthrough()}
    | llm_chain
)

# `rag_chain` is a name shared with other cells. Pin this cell's chain under
# its own name so remove_false_positive_NER2 keeps using this prompt and
# pipeline regardless of which cells have been executed since.
false_positive_chain_v2 = rag_chain


def remove_false_positive_NER2(input):
    """Run the second false-positive filtering prompt on ``input`` and print the answer.

    Args:
        input: The NER data to filter; it is inserted into the prompt's
            ``{input}`` placeholder.

    Returns:
        None. The answer is printed after an ``Answer:`` header.
    """
    result = false_positive_chain_v2.invoke(input)
    answer = result['text']
    print('Answer:')
    # Print only what follows the last '[/INST]' marker of the returned text.
    print(answer.split('[/INST]')[-1])

In [14]:
# Load the NER response from ner3.json and ask the LLM to filter it with the
# second pipeline. The second return value is bound to a named throwaway rather
# than `_`, because assigning to `_` shadows IPython's last-output variable.
NER, _unused_false_positives = get_NER('ner3.json')
remove_false_positive_NER2(NER)

Answer:

  Based on my analysis, here are the false positives that need to be removed from the given NER data:

False Positives:
['City: D-20010 Hamburg', 'Country: Germany']

Filtered NER Data:
{'ADRESS': [...], 'DATE': [...], 'EMAIL': [], 'MANUAL_MARKED': [...], 'NAME': [...], 'ORGANIZATION': ['Danske Bank Hamburg Corporate Banking', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', 'Danske', 'Danske Bank Hamburg', '


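The LLM identifies the false positives and returns the filtered NER data. Note that the output above is cut off and repeats the same entries, so it should be checked before it is relied on.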