# Pulling Named Entities (NER) from an LLM (OpenAI)

##### Date: 11-13-2023
##### Author: Tanmayi Balla (tballa@iu.edu)

###### Naming Convention for the output files:
###### Task names (`<task_name>`): ABBR = Abbreviations, NER = Named Entities, KP = KeyPhrases
###### LLM Responses: "Management_<model_name>_<task_name>_response.json"
###### Extracted Results: "Management_<model_name>_<task_name>_results.json"
###### Cleaned Results: "Management_<model_name>_<task_name>_results_cleaned.json"

In [1]:
import os
import re
import openai
import time
import json
#import datetime
import timeout_decorator
from langchain.text_splitter import RecursiveCharacterTextSplitter
import secret
import datetime

Store all keys in your private `secret.py` file. This file is not synced with GitHub as specified in `.gitignore`.

In [16]:
# Credentials come from the private `secret` module.
openai.api_key = secret.openai_api_key  # "<Your API-KEY>"

# Chat model used for every request; alternatives tried so far:
# "gpt-3.5-turbo", "gpt-4-1106-preview".
model = "gpt-4"
task = 'NER'

# chunk_ is handed to invoke_LLM: 1 splits the input into chunks, 0 sends the
# whole text in a single request. Only the models listed here skip chunking.
chunk_ = 0 if model in ("gpt-4-1106-preview",) else 1

In [17]:
# Prompt prefix prepended to every input text by llm_pipeline.
# NOTE(review): the "Display the output only in the form" section shows
# `CATEGORY:entity1#entity2`, but the worked example uses `CATEGORY#entity1#entity2`;
# to_json_ner accepts either separator after the category label.
# NOTE(review): the closing note lists QUANTITY as absent from the example even
# though the example contains `QUANTITY#2`, and PRODUCT (which is absent) is not
# listed - confirm the intended wording before changing the prompt, since any
# edit here changes what the model is asked to do.
INSTRUCTION = """ 
    ## Instructions ##

    You are an NLP Expert. 

    Extract all the Named Entities from the input that belongs to one of the following categories:

    {
        "CARDINAL": "Cardinal_Number",
        "DATE": "Date",
        "EVENT": "Event",
        "FAC": "Facilities",
        "GPE": "Geopolitical_Entity",
        "LANGUAGE": "Language",
        "LAW": "Law_Document",
        "LOC": "Location",
        "MONEY": "Money",
        "NORP": "Nationalities or religious or political groups",
        "ORDINAL": "Ordinal",
        "ORG": "Organisation",
        "PERCENT": "Percent",
        "PERSON": "Person",
        "PRODUCT": "Product",
        "QUANTITY": "Quantity",
        "TIME": "Time",
        "WORK_OF_ART": "Work_Of_Art"
    }

    Display the output only in the form: 

    DATE:entity1#entity2#.. <new-line> 
    EVENT:entity1#entity2#...<new-line>.
    .
    .
    .

    CARDINAL:entity1#entity2#...<new-line>
    If no entities are present for a particular category - that's fine. Do not display that category in the output. Do not extract the entities that doesn’t belong to the category list provided.

    Do not make up or guess ANY extra information. Only extract what exactly is in the text.

    ### Examples ###
    Sample Input: Mary works at Apple and has received $1.5 million for a product launch on Dec 23, 2022, at Cupertino, California, United States. This event took place around 5PM. After the event, Mary reached home by 7PM and she first started reading Hamlet by Shakespeare. Later, she had 2 eggs for dinner and started booking 3 tickets for Paris, to visit the Eiffel Tower, along with Peter and Lisa. She has chosen the Hilton Hotel for her stay. The tickets have reduced her bank balance to 15%.

    Expected Results:

    {CARDINAL#3#2 \n
    DATE#Dec 23, 2022\n
    EVENT#product launch\n
    FAC#Eiffel Tower#Hilton Hotel\n
    GPE#Paris#United States\n
    LOC#Cupertino#California\n
    MONEY#$1.5 million\n
    ORDINAL#first\n

    ORG#Apple\n
    PERCENT#15%\n
    PERSON#Mary#Shakespeare#Peter#Lisa\n
    QUANTITY#2\n
    TIME#5PM#7PM\n
    WORK_OF_ART#Hamlet\n}

    Note that since there are no entities in the following categories: LANGUAGE, LAW, NORP, and QUANTITY, these categories are not mentioned in the final output.

    Do Not forget to enclose all the entities in a single curly braces {}.

    ## Input ##

"""

In [20]:
@timeout_decorator.timeout(200,timeout_exception=StopIteration)
def llm_pipeline(x):
    """Send the NER prompt followed by `x` to the chat model and return its reply.

    The request is a single user message made of INSTRUCTION, a newline and
    `x`, sent to the module-level `model`. The full API response is printed.

    Args:
        x: Text to extract named entities from.

    Returns:
        The content of the first choice's message, or None when the response
        does not have the expected shape.

    Raises:
        StopIteration: Raised by the timeout decorator (configured for 200
            with ``timeout_exception=StopIteration``).
        Exception: Anything raised by the API call itself propagates to the
            caller; invoke_LLM relies on this to trigger its retries.
    """
    prompt = INSTRUCTION + '\n' + x
    # The API call is deliberately left outside the try block: callers catch
    # and retry on the exceptions it raises.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    print(response)
    try:
        return response.choices[0].message.content
    except openai.error.RateLimitError:
        # NOTE(review): reading attributes off an already-received response
        # presumably never raises RateLimitError, so this branch (and the -1
        # sentinel) looks unreachable. Moving the API call inside the try
        # would activate it, but callers must first handle a plain -1 return.
        print("RateLimitError")
        return -1
    except Exception:
        # Malformed/empty response. `Exception` rather than a bare except so
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        return None

In [19]:
def to_json_ner(x):
    """Parse an LLM reply of `CATEGORY#ent1#ent2` lines into entity records.

    Each line is searched for the first category label that is directly
    followed by `:` or `#` (optional whitespace allowed) and not preceded by a
    letter or underscore. Everything after the separator is the entity list.
    Anchoring on "label + separator" stops category names that merely occur
    inside an entity (e.g. "UPDATE Corp", "TIME Inc") from being treated as a
    new category line.

    Entity splitting:
      * on `#` when the label separator is `#` or the list contains `#`
        (so `DATE#Dec 23, 2022` stays one entity);
      * otherwise on `,` (legacy fallback for `CATEGORY: a, b` replies).

    The characters `{ } ' " [ ] # ( ) :` are removed from every entity, then
    surrounding whitespace is stripped. Entities that end up empty, or equal
    to "non"/"none" (case-insensitive), are dropped.

    Args:
        x: Raw model reply. Anything that is not a string yields an empty list.

    Returns:
        A list of {"entity": str, "category": str} dicts, grouped in the order
        of the internal category list (DATE first, CARDINAL last) and, within
        a category, in order of appearance. The list is also printed.
    """
    categories = ["DATE","EVENT","FAC","GPE","LANGUAGE","LAW","LOC","MONEY","NORP","ORG","PERCENT","PERSON","PRODUCT","QUANTITY","TIME","WORK_OF_ART","ORDINAL","CARDINAL"]
    entities = []
    if not isinstance(x, str):
        print(entities)
        return entities

    # Label must be followed by its separator and must not be the tail of a
    # longer word.
    label_re = re.compile(r"(?<![A-Za-z_])(" + "|".join(categories) + r")\s*([:#])")
    # Both quote characters are listed explicitly; the previous literal lost
    # the double quote through adjacent-string concatenation.
    strip_re = re.compile("[" + re.escape("{}'\"[]#():") + "]")

    found = {category: [] for category in categories}
    for line in x.split('\n'):
        match = label_re.search(line)
        if match is None:
            continue
        category, separator = match.groups()
        payload = line[match.end():].strip()
        if separator == '#' or '#' in payload:
            items = payload.split('#')
        else:
            items = payload.split(',')
        for itm in items:
            # Clean first, then filter, so "{}" or " None" do not slip through.
            itm = strip_re.sub("", itm).strip()
            if not itm or itm.lower() in ('non', 'none'):
                continue
            found[category].append({"entity": itm, "category": category})

    for category in categories:
        entities.extend(found[category])
    print(entities)
    return entities

In [21]:
def invoke_LLM(input_, chunk_, *, max_retries=3, retry_wait=20, chunk_pause=25):
    """Run the NER prompt over `input_`, optionally chunked, and collect results.

    Args:
        input_: Full text to process.
        chunk_: Falsy to send `input_` as a single request; truthy to split it
            with RecursiveCharacterTextSplitter (chunk_size=8000,
            chunk_overlap=20) and send one request per chunk.
        max_retries: Extra attempts made for a chunk after its first call
            raises.
        retry_wait: Seconds slept before each retry.
        chunk_pause: Seconds slept after each successfully processed chunk.

    Returns:
        (responses, entities): the raw reply per processed chunk and the
        concatenated to_json_ner output. Chunks that still fail after all
        retries, or whose reply is None, are skipped. Returns (-1, -1) as soon
        as llm_pipeline reports its -1 rate-limit sentinel.
    """
    if not chunk_:
        docs_rec = [input_]
    else:
        ## Chunk the input into different segments:
        text_splitter_rec = RecursiveCharacterTextSplitter(
            chunk_size = 8000,
            chunk_overlap  = 20
        )
        docs_rec = [doc.page_content for doc in text_splitter_rec.create_documents([input_])]

    entities_dictionary = []
    responses = []
    ## For each document, call the LLM pipeline:
    for split_input in docs_rec:
        print("Length of chunks: ", len(docs_rec))
        split_input = split_input.strip()

        response = None
        timed_out = False
        for attempt in range(max_retries + 1):
            if attempt:
                time.sleep(retry_wait)
            try:
                response = llm_pipeline(split_input)
                # Stop at the first successful call; retrying after success
                # only repeats a paid request and overwrites a good reply.
                break
            except StopIteration:
                # llm_pipeline's timeout decorator raises StopIteration.
                timed_out = True
                print("Timeout Error, trying again...")
            except Exception as e:
                timed_out = False
                print(e, f"Trying again in {retry_wait}sec...")

        # These checks apply to every reply, not only to replies obtained on
        # a retry, so a None/-1 result never reaches to_json_ner.
        if response == -1:
            return -1,-1
        if response is None:
            if timed_out:
                print(f"Tried to call the API {max_retries} times but facing Timeout... Moving on to the next chunk.")
            else:
                print("Couldn't process the Task. Moving to next chunk....")
            continue

        responses.append(response)
        entities_dictionary.extend(to_json_ner(response))
        time.sleep(chunk_pause)
    return responses,entities_dictionary

In [12]:
## Test sample to invoke LLM:
## Smoke test: send one sample passage through llm_pipeline.
sample_text = """Bristol-Myers Squibb Company (which may be referred to as Bristol-Myers Squibb, BMS, the Company, we, our or us) is a global specialty biopharmaceutical company whose mission is to discover, develop and deliver innovative medicines that help patients prevail over serious diseases.

    We continue to evolve our business to a leading diversified specialty biopharma company. The evolution was accelerated as a result of the diabetes business divestiture and continued focus on certain therapeutic areas, including immuno-oncology. The following provides a brief summary of certain key events in 2014 ,as discussed in more detail throughout this report.

    Opdivo was approved in the U.S. and Japan for unresectable or metastatic melanoma, and we announced positive results from certain other studies in melanoma, lung, Hodgkin Lymphoma and renal cell carcinoma. Several clinical collaborations were also entered into by us to seek opportunities to strategically combine Opdivo with other targeted agents in more than a dozen tumor types. Eliquis obtained an important label extension in 2014. We received regulatory approvals for our Hepatitis C Franchise, including Daklinza in the EU and our dual regimen of Daklinza and Sunvepra in Japan. Several business development transactions were completed in 2014, to advance our pipeline in other therapeutic areas, including fibrosis and genetically defined diseases. We are also expanding our biologics manufacturing capacity at Devens, Massachusetts and announced plans to build a new facility in Ireland.

    Our revenues decreased by 3% in 2014 as a result of the diabetes business divestiture, exclusivity losses and expiration of rights partially offset by higher sales of key products, including recently launched products in certain markets. Our focus to optimize global brands and key markets accelerated growth of several key products. Eliquis sales increased in 2014 by $628 million following its global launch in 2013. Yervoy sales increased by 36%, or $348 million, from continued penetration in the U.S. community-based setting and first line indication and improved access internationally. Hepatitis C Franchise sales were $256 million following launches in Japan and certain EU countries. We expect these products will continue to grow in 2015 along with Orencia, Sprycel and recently launched Opdivo which will partially offset revenue reductions resulting from the expiration of certain rights pertaining to Abilify* in the U.S., royalty and alliance agreements, exclusivity losses for Baraclude in the U.S. and changes in foreign currency rates.

    Higher pension and research and development related charges contributed to the reduction of GAAP EPS from $1.54 in 2013 to $1.20 in 2014. Non-GAAP EPS increased from $1.82 to $1.85. Proceeds from the diabetes divestiture increased cash and marketable securities by $3.5 billion.

    """

# res_N stays None if the call times out (the timeout surfaces as StopIteration).
res_N = None
try:
    res_N = llm_pipeline(sample_text)
except StopIteration as timeout_err:
    print(timeout_err)

{
  "id": "chatcmpl-8RV3CMlEgWlt5U8DyH2nj9MmvuE1Y",
  "object": "chat.completion",
  "created": 1701564906,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{CARDINAL#3 \n\nDATE#2014#2013 \n\nEVENT#diabetes business divestiture \n\nFAC#Devens\n\nGPE#U.S.#Japan#EU#Ireland#Massachusetts \n\nMONEY#$628 million#$348 million#$256 million#$3.5 billion \n\nNORP#Japanese \n\nORG#Bristol-Myers Squibb Company#Bristol-Myers Squibb#BMS#Opdivo#Eliquis#Hepatitis C Franchise#Daklinza#Sunvepra#Yervoy#Orencia#Sprycel#Abilify#Baraclude \n\nPERCENT#3%#36% \n\nPERSON#Hodgkin Lymphoma \n\nPRODUCT#diabetes \n\nWORK_OF_ART#Abilify*}"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 1217,
    "completion_tokens": 169,
    "total_tokens": 1386
  },
  "system_fingerprint": null
}


In [8]:
# Show the raw reply text stored in res_N by the sample call.
print(res_N)

{DATE#2014#2013#2015

EVENT#diabetes business divestiture

GPE#U.S.#Japan#EU#Ireland#Massachusetts

MONEY#$628 million#$348 million#$256 million#$3.5 billion#1.54#1.20#1.82#1.85

ORG#Bristol-Myers Squibb Company#Bristol-Myers Squibb#BMS#Opdivo#Eliquis#Hepatitis C Franchise#Daklinza#Sunvepra#Yervoy#Orencia#Sprycel#Abilify*#Baraclude

PERCENT#3%#36%

PRODUCT#Opdivo#Eliquis#Daklinza#Sunvepra#Yervoy#Orencia#Sprycel#Hepatitis C Franchise

}


In [16]:
# Show res_N again (this cell's captured output below differs from the earlier print cell's).
print(res_N)

{CARDINAL#3%#36%#2013#2014#2015 

DATE#2014#2013

EVENT#diabetes business divestiture

GPE#U.S.#Japan#EU#Devens#Massachusetts#Ireland

MONEY#$628 million#$348 million#$256 million#$3.5 billion#1.54#$1.20#$1.82#$1.85

ORG#Bristol-Myers Squibb Company#Bristol-Myers Squibb#BMS#Opdivo#Eliquis#Daklinza#Sunvepra#Hepatitis C Franchise#Yervoy#Orencia#Sprycel#Abilify*#Baraclude

PERCENT#3%

PRODUCT#Eliquis#Yervoy#Hepatitis C Franchise#Opdivo#Orencia#Sprycel#Abilify#Baraclude

}


In [9]:
# Parse the sample reply into entity/category records and display them.
print(to_json_ner(res_N))

[{'entity': '2014', 'category': 'DATE'}, {'entity': '2013', 'category': 'DATE'}, {'entity': '2015', 'category': 'DATE'}, {'entity': 'diabetes business divestiture', 'category': 'EVENT'}, {'entity': 'U.S.', 'category': 'GPE'}, {'entity': 'Japan', 'category': 'GPE'}, {'entity': 'EU', 'category': 'GPE'}, {'entity': 'Ireland', 'category': 'GPE'}, {'entity': 'Massachusetts', 'category': 'GPE'}, {'entity': '$628 million', 'category': 'MONEY'}, {'entity': '$348 million', 'category': 'MONEY'}, {'entity': '$256 million', 'category': 'MONEY'}, {'entity': '$3.5 billion', 'category': 'MONEY'}, {'entity': '1.54', 'category': 'MONEY'}, {'entity': '1.20', 'category': 'MONEY'}, {'entity': '1.82', 'category': 'MONEY'}, {'entity': '1.85', 'category': 'MONEY'}, {'entity': 'Bristol-Myers Squibb Company', 'category': 'ORG'}, {'entity': 'Bristol-Myers Squibb', 'category': 'ORG'}, {'entity': 'BMS', 'category': 'ORG'}, {'entity': 'Opdivo', 'category': 'ORG'}, {'entity': 'Eliquis', 'category': 'ORG'}, {'entity

Extracting the NERs

In [22]:
# Walk data/10-Ks (a sibling of the current working directory's parent), run
# the NER pipeline on every folder's Management.txt, and write two JSON files
# next to it: the raw model replies and the parsed entities.
cwd = os.getcwd()
pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

root_directory = pwd
target_file = 'Management.txt'
gpt_response_file = 'Management_' + str(model) + '_' +str(task) + '_response.json'
gpt_result_file = 'Management_' + str(model) + '_' +str(task) + '_results.json'

rate_limited = False
for root, directories, _ in os.walk(root_directory):
    for directory in directories:
        folder_path = os.path.join(root, directory)
        files = os.listdir(folder_path)
        # Skip folders that are already processed or have nothing to process.
        if gpt_result_file in files or target_file not in files:
            continue

        file_pth = os.path.join(folder_path, target_file)
        print(file_pth)
        try:
            with open(file_pth, 'r', encoding = 'utf-8') as rf:
                input_ = rf.read()
        except UnicodeDecodeError as err:
            print(err)
            continue

        # NOTE(review): newlines are removed without inserting a space, so the
        # words on either side of a line break are fused - confirm intended.
        input_ = input_.replace("\n","")
        res, ent_dict = invoke_LLM(input_,chunk_)

        # Check the rate-limit sentinel before anything touches `res`:
        # len(-1) raises, and -1 must not be written out as a result.
        if res == -1 and ent_dict == -1:
            print("Rate Limit Exceeded. Sleep Mode....!!!")
            rate_limited = True
            break
        # `res` is a list with one reply per chunk. The previous
        # `len(res) <= 5` test dropped the results of every document with
        # five or fewer chunks (including all un-chunked runs).
        if not res:
            continue

        output_file_path_responses = os.path.join(folder_path,gpt_response_file)
        with open(output_file_path_responses,"w") as jsonfile1:
            try:
                json.dump(res,jsonfile1, indent=4)
            except (TypeError, ValueError) as err:
                print(err)

        # `ent_dict` is a list, so test emptiness rather than equality with {}.
        if not ent_dict:
            continue

        output_file_path = os.path.join(folder_path,gpt_result_file)
        with open(output_file_path,"w") as jsonfile:
            try:
                json.dump(ent_dict,jsonfile, indent=4)
            except (TypeError, ValueError) as err:
                print(err)
    if rate_limited:
        # Stop the whole walk; a bare `exit` expression did nothing.
        break

/home/damir/Dropbox/Documents/Teaching/2023/L715/l715_23/data/10-Ks/Rocket_Lab_USA_Inc/2022/Management.txt
Length of chunks:  8
{
  "id": "chatcmpl-8RV6Dexpcu0YMzOyd2rEbElLHS1ne",
  "object": "chat.completion",
  "created": 1701565093,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{CARDINAL#7#10#109#20#2021#8,000#2029\n\nDATE#December 31, 2021#March 2021#August 25, 2021#May 7, 2021#June 25, 2021\n\nEVENT#Business Combination#First Merger\n\nFAC#International Space Station\n\nGPE#United States#Cayman Islands#State of Delaware\n\nLAW#Merger Agreement\n\nORG#Rocket Lab#Electron#U.S. government#Neutron#Photon#Sinclair Interplanetary#Planetary Systems Corporation#SolAero Holdings#ASI Aerospace LLC#Vector Acquisition Corporation#Vector#Legacy Rocket Lab#Nasdaq\n\nPERSON#Management\n\nPRODUCT#3D printed electric turbo-pump rocket engines#fully carbon composite first stage fuel tanks#a private orbital launch 

In [6]:
# cwd = os.getcwd()
# pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

# root_directory = pwd
# target_file = 'Management.txt'
# gpt_response_file = 'Management_' + str(model) + '_' +str(task) + '_response.json'
# gpt_result_file = 'Management_' + str(model) + '_' +str(task) + '_results.json'

# for root, directories, files in os.walk(root_directory):
#     for directory in directories:
#         folder_path = os.path.join(root, directory)
#         files = os.listdir(folder_path)
#         if gpt_response_file in files:
#             entities_dictionary = []
#             file_pth = os.path.join(folder_path, gpt_response_file)
#             print(file_pth)
#             with open(file_pth) as rf:
#                 input_ = json.load(rf)
#                 for inp in input_:
#                     entities_dictionary.extend(to_json_ner(inp))
#             output_file_path = os.path.join(folder_path, gpt_result_file)
#             with open(output_file_path,"w") as jsonfile:
#                 try:
#                     json.dump(entities_dictionary,jsonfile, indent=4)
#                 except(err):
#                     print(err)


/Users/tanmayiballa/Desktop/KG-LLM/l715_23-1/data/10-Ks/Bristol-Myers_Squibb_Company/2014/Management_gpt-3.5-turbo_NER_response.json
[{'entity': 'U.S.', 'category': 'GPE'}, {'entity': 'United States', 'category': 'GPE'}, {'entity': 'the UK', 'category': 'GPE'}, {'entity': 'Germany', 'category': 'GPE'}, {'entity': 'Europe', 'category': 'GPE'}, {'entity': 'Bristol-Myers Squibb Company', 'category': 'ORG'}, {'entity': 'Bristol-Myers Squibb', 'category': 'ORG'}, {'entity': 'BMS', 'category': 'ORG'}, {'entity': 'Company', 'category': 'ORG'}]
[{'entity': 'May 2012', 'category': 'DATE'}, {'entity': '2013', 'category': 'DATE'}, {'entity': 'February 2014', 'category': 'DATE'}, {'entity': 'the last three years', 'category': 'DATE'}, {'entity': 'U.S.', 'category': 'GPE'}, {'entity': 'United States', 'category': 'GPE'}, {'entity': 'approximately $3 billion', 'category': 'GPE'}, {'entity': 'late', 'category': 'GPE'}, {'entity': 'Devens', 'category': 'LOC'}, {'entity': 'Massachusetts', 'category': '