# Pulling Abbreviations from LLM (openai)

##### Date: 11-13-2023
##### Author: Tanmayi Balla (tballa@iu.edu)

###### Naming Convention for the output files:
###### task_names: Abbreviations: ABBR, Named Entities: NER, KeyPhrases: KP
###### LLM Responses: "Management_<model_name>_<task_name>_response.json"
###### Extracted Results: "Management_<model_name>_<task_name>_results.json"
###### Cleaned Results: "Management_<model_name>_<task_name>_results_cleaned.json"

In [None]:
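# !pip install timeout-decorator "openai<1" langchain   # `secret` is your private local secret.py, not a pip package; the code uses the pre-1.0 openai API (openai.ChatCompletion)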

In [1]:
import os
import re
import openai
import time
import json
import timeout_decorator
from langchain.text_splitter import RecursiveCharacterTextSplitter
import secret
import datetime

Store all keys in your private `secret.py` file. This file is not synced with GitHub as specified in `.gitignore`.

In [10]:
# The API key is read from the private `secret` module imported above.
openai.api_key = secret.openai_api_key  # "<Your API-KEY>"

# Model used for every request. Other values that were tried here:
# "gpt-3.5-turbo" and "gpt-4".
model = "gpt-4-1106-preview"

# Task tag that becomes part of the output file names.
task = "ABBR"

# Flag handed to invoke_LLM: truthy -> split the input into chunks,
# 0 -> send the input as a single piece. Only plain "gpt-4" turns it off.
chunk_ = 0 if model == "gpt-4" else 1

In [5]:
@timeout_decorator.timeout(200,timeout_exception=StopIteration)
def llm_pipeline(x):
    """Ask the chat model to extract abbreviations from one piece of text.

    The request uses OpenAI function calling with a single function,
    ``extract_abbreviations``, whose arguments carry a list of
    ``{"abbreviation": ..., "full_form": ...}`` objects.

    Args:
        x: The text (one chunk) to analyse; sent as the user message.

    Returns:
        The parsed ``abbreviations`` list from the function-call arguments,
        ``None`` when the reply contains no parsable function call (for
        example when the model answers in plain text), or ``-1`` if a
        ``RateLimitError`` is raised while reading the reply.

    Raises:
        StopIteration: raised by the timeout decorator after 200 seconds.
        Exception: anything raised by the API request itself (including
            rate-limit errors) propagates to the caller, because the request
            is made outside the ``try`` block below.
    """
    messages = [
        {
            "role": "system",
            # The worked example must name the abbreviation exactly as it
            # appears in the sample text ("SAA"); it previously said "SSA".
            "content": """You are a Natural Language processing expert. Your task is to identify the abbreviations from the given input and extract them with their full form. If there are no abbreviations in the text, simply output "No Abbreviations found". Here is an example: Sample Input: "The Society of American Archivists (SAA) defines an archive as: An organization that collects the records of individuals, families, or other organizations. Archives may be institutional (e.g., United Nations Archives)". The expected output is SAA and Society of American Archivists as it's full form."""
        },
        {
            "role": "user",
            "content": x
        }
    ]

    # JSON-schema description of the structured output we want back.
    functions = [
        {
            "name": "extract_abbreviations",
            "description": "Extract all abbreviations and their full form from all of the sentences in the text in the format abbreviation#full-form",
            "parameters": {
                "type": "object",
                "properties": {
                    "abbreviations": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "abbreviation": {"type": "string"},
                                "full_form": {"type": "string"}
                            },
                            "description": "Extract all abbreviations and their full form from all of the sentences in the text in list. Do not make up any data and find abbreviation only from the given text. abbreviation: abbreviation found in the text. full_form: the full form of the abbreviation."
                        }
                    }
                }
            }
        }
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        functions=functions,
        function_call="auto",
        temperature=0
    )
    print(response)
    try:
        arguments = response['choices'][0]['message']['function_call']['arguments']
        return json.loads(arguments)['abbreviations']
    except openai.error.RateLimitError:
        # NOTE: the request above is outside this try, so a rate-limit error
        # raised by the API call is NOT caught here; it reaches the caller.
        print("RateLimitError")
        return -1
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # No function call in the reply, or its arguments are not the JSON
        # we asked for (json.JSONDecodeError is a ValueError).
        return None

In [6]:
## Converting LLM responses into a json dictionary format.

def to_json_abr(x):
    """Convert one parsed LLM reply into the ABBR/DEF record format.

    Args:
        x: ``None`` or an iterable of dicts with the keys ``abbreviation``
            and ``full_form`` (the list returned by ``llm_pipeline``).

    Returns:
        ``{}`` when ``x`` is ``None`` (kept for backward compatibility),
        otherwise a list of ``{"ABBR": ..., "DEF": ...}`` dicts. Entries that
        are not dicts or lack either key are skipped, so a single malformed
        item from the model does not abort the whole document.
    """
    if x is None:
        return {}

    abbr = []
    for r in x:
        # The function schema does not mark the fields as required, so the
        # model may omit one of them.
        if not isinstance(r, dict) or 'abbreviation' not in r or 'full_form' not in r:
            continue
        abbr.append({
            "ABBR": r['abbreviation'],
            "DEF": r['full_form'],
        })
    print(abbr)
    return abbr

In [7]:
def _retry_llm_pipeline(split_input, attempts=3, delay=20):
    """Call llm_pipeline up to `attempts` times, sleeping `delay` seconds before each try.

    Returns the first non-None result, or None if every attempt raised or
    returned None.
    """
    for _ in range(attempts):
        time.sleep(delay)
        try:
            response = llm_pipeline(split_input)
        except Exception as e:
            print(e, f"Trying again in {delay}sec...")
            continue
        if response is not None:
            # Stop at the first usable answer instead of spending the
            # remaining attempts (and risking overwriting a good result).
            return response
    return None


def invoke_LLM(input_, chunk_):
    """Run the abbreviation extraction over a document, chunk by chunk.

    Args:
        input_: Full text of the document.
        chunk_: Falsy to send `input_` as one piece; truthy to split it with
            RecursiveCharacterTextSplitter (chunk_size=7000, chunk_overlap=20).

    Returns:
        A tuple ``(responses, entities_dictionary)``: ``responses`` holds the
        value returned by ``llm_pipeline`` for each processed chunk and
        ``entities_dictionary`` is the flat list of ABBR/DEF records produced
        by ``to_json_abr``. Returns ``(-1, -1)`` as soon as ``llm_pipeline``
        reports ``-1``.

    A chunk whose first call raises is retried up to three times (20 s
    apart); if that still yields nothing the chunk is skipped. The function
    sleeps 25 s after every successfully processed chunk.
    """
    if not chunk_:
        docs_rec = [input_]
    else:
        ## Chunk the input into different segments:
        text_splitter_rec = RecursiveCharacterTextSplitter(
            chunk_size=7000,
            chunk_overlap=20
        )
        docs_rec = [doc.page_content for doc in text_splitter_rec.create_documents([input_])]

    entities_dictionary = []
    responses = []
    ## For each document, call the LLM pipeline:
    for chunk_text in docs_rec:
        split_input = chunk_text.strip()
        try:
            response = llm_pipeline(split_input)
        except StopIteration:
            # StopIteration is the exception the timeout decorator raises.
            print("Timeout Error, trying again...")
            response = _retry_llm_pipeline(split_input)
            if response is None:
                print("Tried to call the API 3 times but facing Timeout... Moving on to the next chunk.")
                continue
        except Exception as e:
            print(e, "Trying again in 20sec...")
            response = _retry_llm_pipeline(split_input)
            if response is None:
                print("Couldn't process the Task. Moving to next chunk....")
                continue

        # Checked for every path (first call and retries) so the sentinel is
        # never handed to to_json_abr.
        if response == -1:
            return -1, -1

        responses.append(response)
        entities_dictionary.extend(to_json_abr(response))
        time.sleep(25)
    return responses, entities_dictionary

In [8]:
## Test sample to invoke LLM:

# Excerpt from a 10-K "Management" section, used as a smoke test for invoke_LLM.
_sample_text = """Bristol-Myers Squibb Company (which may be referred to as Bristol-Myers Squibb, BMS, the Company, we, our or us) is a global specialty biopharmaceutical company whose mission is to discover, develop and deliver innovative medicines that help patients prevail over serious diseases.

    We continue to evolve our business to a leading diversified specialty biopharma company. The evolution was accelerated as a result of the diabetes business divestiture and continued focus on certain therapeutic areas, including immuno-oncology. The following provides a brief summary of certain key events in 2014 ,as discussed in more detail throughout this report.

    Opdivo was approved in the U.S. and Japan for unresectable or metastatic melanoma, and we announced positive results from certain other studies in melanoma, lung, Hodgkin Lymphoma and renal cell carcinoma. Several clinical collaborations were also entered into by us to seek opportunities to strategically combine Opdivo with other targeted agents in more than a dozen tumor types. Eliquis obtained an important label extension in 2014. We received regulatory approvals for our Hepatitis C Franchise, including Daklinza in the EU and our dual regimen of Daklinza and Sunvepra in Japan. Several business development transactions were completed in 2014, to advance our pipeline in other therapeutic areas, including fibrosis and genetically defined diseases. We are also expanding our biologics manufacturing capacity at Devens, Massachusetts and announced plans to build a new facility in Ireland.

    Our revenues decreased by 3% in 2014 as a result of the diabetes business divestiture, exclusivity losses and expiration of rights partially offset by higher sales of key products, including recently launched products in certain markets. Our focus to optimize global brands and key markets accelerated growth of several key products. Eliquis sales increased in 2014 by $628 million following its global launch in 2013. Yervoy sales increased by 36%, or $348 million, from continued penetration in the U.S. community-based setting and first line indication and improved access internationally. Hepatitis C Franchise sales were $256 million following launches in Japan and certain EU countries. We expect these products will continue to grow in 2015 along with Orencia, Sprycel and recently launched Opdivo which will partially offset revenue reductions resulting from the expiration of certain rights pertaining to Abilify* in the U.S., royalty and alliance agreements, exclusivity losses for Baraclude in the U.S. and changes in foreign currency rates.

    Higher pension and research and development related charges contributed to the reduction of GAAP EPS from $1.54 in 2013 to $1.20 in 2014. Non-GAAP EPS increased from $1.82 to $1.85. Proceeds from the diabetes divestiture increased cash and marketable securities by $3.5 billion.

    """

# Run the pipeline with chunking enabled; a StopIteration that escapes is printed.
try:
    invoke_LLM(_sample_text, 1)
except StopIteration as timeout_err:
    print(timeout_err)

{
  "id": "chatcmpl-8PiVphKTvhcAD824vdd8GEXHkgIev",
  "object": "chat.completion",
  "created": 1701139997,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": null,
        "function_call": {
          "name": "extract_abbreviations",
          "arguments": "{\n  \"abbreviations\": [\n    {\n      \"abbreviation\": \"BMS\",\n      \"full_form\": \"Bristol-Myers Squibb\"\n    },\n    {\n      \"abbreviation\": \"GAAP\",\n      \"full_form\": \"Generally Accepted Accounting Principles\"\n    },\n    {\n      \"abbreviation\": \"EPS\",\n      \"full_form\": \"Earnings Per Share\"\n    },\n    {\n      \"abbreviation\": \"EU\",\n      \"full_form\": \"European Union\"\n    },\n    {\n      \"abbreviation\": \"U.S.\",\n      \"full_form\": \"United States\"\n    }\n  ]\n}"
        }
      },
      "finish_reason": "function_call"
    }
  ],
  "usage": {
    "prompt_tokens": 791,
    "completion_tokens": 131,
   

In [11]:
## Extracting the Abbreviations

# The 10-K corpus is expected at ../data/10-Ks relative to the working directory.
cwd = os.getcwd()
pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

root_directory = pwd
target_file = 'Management.txt'
gpt_response_file = 'Management_' + str(model) + '_' +str(task) + '_response.json'
gpt_result_file = 'Management_' + str(model) + '_' +str(task) + '_results.json'

# Set when invoke_LLM reports the (-1, -1) sentinel, so both loops can stop.
rate_limited = False

for root, directories, _walk_files in os.walk(root_directory):
    for directory in directories:
        folder_path = os.path.join(root, directory)
        files = os.listdir(folder_path)
        # Skip folders that were already processed or have no input file.
        if gpt_result_file in files or target_file not in files:
            continue

        file_pth = os.path.join(folder_path, target_file)
        print(file_pth)
        try:
            # Read and close the input before the (slow) LLM calls start.
            with open(file_pth, 'r', encoding='utf-8') as rf:
                input_ = rf.read()
        except UnicodeDecodeError as err:
            print(err)
            continue

        input_ = input_.replace("\n","")
        res, ent_dict = invoke_LLM(input_,chunk_)

        # The sentinel must be tested before len(res): len(-1) raises TypeError.
        if res == -1 and ent_dict == -1:
            print("Rate Limit Exceeded. Sleep Mode....!!!")
            rate_limited = True
            break
        # NOTE(review): res holds one entry per processed chunk, so this skips
        # documents that produced five or fewer chunk responses - confirm the
        # threshold is intended.
        if res is None or len(res) <= 5:
            continue
        # ent_dict is always a list here; an empty list is still written so
        # the folder is recognised as processed on the next run.

        output_file_path = os.path.join(folder_path,gpt_result_file)
        output_file_path_responses = os.path.join(folder_path,gpt_response_file)

        for out_path, payload in ((output_file_path, ent_dict),
                                  (output_file_path_responses, res)):
            with open(out_path,"w") as jsonfile:
                try:
                    json.dump(payload,jsonfile, indent=4)
                except (TypeError, ValueError) as err:
                    # Payload was not JSON-serialisable.
                    print(err)
    if rate_limited:
        break

/home/damir/Dropbox/Documents/Teaching/2023/L715/l715_23/data/10-Ks/Rocket_Lab_USA_Inc/2022/Management.txt
{
  "id": "chatcmpl-8PikwWyTUdHESkrcDQXFICUILObOK",
  "object": "chat.completion",
  "created": 1701140934,
  "model": "gpt-4-1106-preview",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": null,
        "function_call": {
          "name": "extract_abbreviations",
          "arguments": "{\"abbreviations\":[{\"abbreviation\":\"SAA\",\"full_form\":\"Society of American Archivists\"}]}"
        }
      },
      "finish_reason": "function_call"
    }
  ],
  "usage": {
    "prompt_tokens": 1481,
    "completion_tokens": 32,
    "total_tokens": 1513
  },
  "system_fingerprint": "fp_a24b4d720c"
}
[{'ABBR': 'SAA', 'DEF': 'Society of American Archivists'}]
{
  "id": "chatcmpl-8PilNtiHAWNhe0GafQ7J118UAiBiz",
  "object": "chat.completion",
  "created": 1701140961,
  "model": "gpt-4-1106-preview",
  "choices": [
    {
      "index": 0

In [31]:
def cleanup_llm_responses_abr(x, ignore_case=True):
    """Filter and de-duplicate extracted abbreviation records.

    Args:
        x: Iterable of dicts with the keys ``ABBR`` and ``DEF``.
        ignore_case: When true (default), the "every character of the
            abbreviation occurs in the definition" check ignores case, so
            pairs such as ``HCV`` / ``hepatitis C virus`` are kept. Pass
            ``False`` for a case-sensitive check.

    Returns:
        A dict mapping each accepted abbreviation to its definition. The
        first definition seen for an abbreviation wins.

    A record is dropped when, after removing the characters
    ``{ } ' " [ ] # ( ) :`` and surrounding whitespace:
      * the abbreviation or the definition is empty;
      * some character of the abbreviation (other than ``.`` and ``-``) does
        not occur in the definition;
      * one of the definition's lines equals the abbreviation;
      * the definition is fewer than 3 characters longer than the
        abbreviation;
      * the abbreviation was already accepted.
    """
    print(x)

    # The original literal "{}''""[]#():" was two adjacent strings, so the
    # double quote never made it into the set; it is included explicitly here.
    char_rem = "{}'\"[]#():"
    pattern = re.compile("[" + re.escape(char_rem) + "]")

    cleaned_dict = {}
    for entry in x:
        # Normalise first so every later check (including the duplicate
        # check) sees the same key that will be stored.
        abbr = pattern.sub("", entry['ABBR']).strip()
        definition = pattern.sub("", entry['DEF']).strip()
        if not abbr or not definition:
            continue

        needle = abbr.casefold() if ignore_case else abbr
        haystack = definition.casefold() if ignore_case else definition
        if any(ch not in ".-" and ch not in haystack for ch in needle):
            continue

        if abbr in definition.split('\n'):
            continue
        if len(definition) - len(abbr) < 3:
            continue
        if abbr in cleaned_dict:
            continue
        cleaned_dict[abbr] = definition
    return cleaned_dict

In [33]:
## Cleaning the extracted Abbreviations

# The 10-K corpus is expected at ../data/10-Ks relative to the working directory.
cwd = os.getcwd()
pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

root_directory = pwd

gpt_result_file = 'Management_' + str(model) + '_' +str(task) + '_results.json'
gpt_cleaned_result_file = 'Management_' + str(model) + '_' +str(task) + '_results_cleaned.json'

for root, directories, _walk_files in os.walk(root_directory):
    for directory in directories:
        folder_path = os.path.join(root, directory)
        files = os.listdir(folder_path)
        # Only folders that actually contain extracted results can be cleaned;
        # opening a missing results file would raise FileNotFoundError.
        if gpt_result_file not in files:
            continue

        file_pth = os.path.join(folder_path, gpt_result_file)
        print(file_pth)
        # Context manager so the results file is closed after loading.
        with open(file_pth) as file_:
            cleaned_res = cleanup_llm_responses_abr(json.load(file_))

        output_file_path_cleaned = os.path.join(folder_path,gpt_cleaned_result_file)
        with open(output_file_path_cleaned,"w") as jsonfile:
            try:
                json.dump(cleaned_res,jsonfile, indent=4)
            except (TypeError, ValueError) as err:
                # Payload was not JSON-serialisable.
                print(err)

/Users/tanmayiballa/Downloads/KG-LLM-test/l715_23/data/10-Ks/Bristol-Myers_Squibb_Company/2014/Management_gpt-3.5-turbo_ABBR_results.json
[{'ABBR': 'EMA', 'DEF': 'European Medicines Agency'}, {'ABBR': 'FDA', 'DEF': 'Food and Drug Administration'}, {'ABBR': 'HIV', 'DEF': 'Human Immunodeficiency Virus'}, {'ABBR': 'PD-1', 'DEF': 'programmed death receptor-1'}, {'ABBR': 'DAA', 'DEF': 'direct-acting antiviral'}, {'ABBR': 'HCV', 'DEF': 'hepatitis C virus'}, {'ABBR': 'BMS', 'DEF': 'Bristol-Myers Squibb'}, {'ABBR': 'HBV', 'DEF': 'hepatitis B virus'}, {'ABBR': 'US', 'DEF': 'United States'}, {'ABBR': 'ASCO', 'DEF': 'American Society of Clinical Oncology'}, {'ABBR': 'NSCLC', 'DEF': 'non-small-cell lung cancer'}, {'ABBR': 'CML', 'DEF': 'chronic myeloid leukemia'}, {'ABBR': 'GIST', 'DEF': 'gastrointestinal stromal tumor'}, {'ABBR': 'AML', 'DEF': 'acute myeloid leukemia'}, {'ABBR': 'ALL', 'DEF': 'acute lymphoblastic leukemia'}, {'ABBR': 'CLL', 'DEF': 'chronic lymphocytic leukemia'}, {'ABBR': 'MM', '