# Pulling KeyPhrases from LLM (openai)

##### Date: 11-13-2023
##### Author: Kishan Rathore (krathore@iu.edu)

###### Naming convention for the output files:
###### `<task_name>` is one of: ABBR (abbreviations), NER (named entities), KP (key phrases)
###### LLM Responses: "Management_<model_name>_<task_name>_response.json"
###### Extracted Results: "Management_<model_name>_<task_name>_results.json"
###### Cleaned Results: "Management_<model_name>_<task_name>_results_cleaned.json"

In [None]:
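# !pip install timeout-decorator "openai<1" langchain  # pre-1.0 openai is required for openai.ChatCompletion; `secret` is presumably a local secret.py that defines openai_api_key - confirm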

In [1]:
import os
import re
import openai
import time
import json
import datetime
import timeout_decorator
from langchain.text_splitter import RecursiveCharacterTextSplitter
import secret

In [2]:
# The API key is read from the imported `secret` module instead of being hard-coded here.
openai.api_key = secret.openai_api_key  # "<Your API-KEY>"

# Model used for every request. Alternatives tried: "gpt-3.5-turbo", "gpt-4-1106-preview".
model = "gpt-4"

# Task tag that ends up in the output file names.
task = 'KP'

# Chunking flag passed to invoke_LLM: 1 splits the document into chunks first,
# 0 sends the whole document in a single request.
chunk_ = 0 if model == 'gpt-4-1106-preview' else 1

In [3]:
@timeout_decorator.timeout(200,timeout_exception=StopIteration)
def llm_pipeline(x):
    """Ask the chat model to extract key phrases from one piece of text.

    The request offers the model a single function, ``extract_keyphrase``,
    whose ``key_phrases`` argument is an array of strings; the phrases are
    read back from that function call.

    Args:
        x: The text to analyse (sent as the user message).

    Returns:
        The value of ``key_phrases`` from the model's function call, or
        ``None`` when the reply contains no function call or its arguments
        cannot be parsed.

    Raises:
        StopIteration: raised by the timeout decorator when the call runs
            longer than the configured 200 limit.
        Exception: anything raised by ``openai.ChatCompletion.create``
            (rate-limit errors included) propagates to the caller unchanged.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a Natural Language processing expert. Your job is to extract all the key phrases from the given text using the Text Rank algorithm, If there are no key phrases in the text, simply output "No KeyPhrases found". Here is an example: Sample Input: "The food was delicious and the staff were wonderful.", key phrase extraction will return the main topics: "food" and "wonderful staff". Expected Output: {"food", "wonderful staff"}."""
        },
        {
            "role": "user",
            "content": x
        } ]
    functions = [ {
        "name": "extract_keyphrase",
        "description": "extract key phrases from all of the sentences in the text",
        "parameters": {
            "type": "object",
            'properties':{
                "key_phrases":{
                    "type": "array",
                    "items": {
                        "type": "string",
                    },
                    "description": "extract key phrases from all of the sentences in the text in list. Do no make up any data."
                }
            }
    }} ]
    # The request is deliberately outside the try below: API failures must
    # reach the caller instead of being mistaken for "nothing extracted".
    response = openai.ChatCompletion.create(
        model = model, #"gpt-3.5-turbo-0613",
        messages=messages,
        functions=functions,
        function_call="auto",
        temperature= 0
    )
    print(response)
    try:
        arguments = response['choices'][0]['message']['function_call']['arguments']
        return json.loads(arguments)['key_phrases']
    except (KeyError, IndexError, TypeError, ValueError):
        # With function_call="auto" the model may answer in plain text (no
        # 'function_call' key) or return malformed JSON arguments; only those
        # parsing failures are mapped to None.
        return None

In [4]:
def _retry_llm_pipeline(split_input):
    """Call llm_pipeline up to 3 more times, waiting 20 seconds before each try.

    Returns the first non-None result (a phrase list or the -1 marker), or
    None when every attempt raised or produced nothing.
    """
    for _ in range(3):
        time.sleep(20)
        try:
            response = llm_pipeline(split_input)
        except Exception as e:
            print(e, "Trying again in 20sec...")
            continue
        if response is not None:
            # Stop at the first usable answer: further calls would cost more
            # requests and could overwrite a good result with a failure.
            return response
    return None


def invoke_LLM(input_, chunk_ = 1):
    """Run llm_pipeline over a document, optionally splitting it into chunks.

    Args:
        input_: Full document text.
        chunk_: Falsy to send the text as one request; otherwise the text is
            split with RecursiveCharacterTextSplitter (chunk_size 7000,
            chunk_overlap 20) and each piece is sent separately.

    Returns:
        A list with one entry per processed chunk, each entry being whatever
        llm_pipeline returned for it. A chunk whose first call raised and
        whose retries all failed is skipped. If any call reports the -1
        marker, -1 is returned immediately instead of a list.
    """
    if not chunk_:
        docs_rec = [input_]
    else:
        ## Chunk the input into different segments:
        text_splitter_rec = RecursiveCharacterTextSplitter(
            chunk_size = 7000,
            chunk_overlap  = 20
        )
        docs_rec = [doc.page_content for doc in text_splitter_rec.create_documents([input_])]

    responses = []
    ## For each document, call the LLM pipeline:
    for raw_chunk in docs_rec:
        split_input = raw_chunk.strip()
        try:
            response = llm_pipeline(split_input)
        except StopIteration:
            # llm_pipeline's timeout decorator signals a timeout with StopIteration.
            print("Timeout Error, trying again...")
            response = _retry_llm_pipeline(split_input)
            if response is None:
                print("Tried to call the API 3 times but facing Timeout... Moving on to the next chunk.")
                continue
        except Exception as e:
            print(e, "Trying again in 20sec...")
            response = _retry_llm_pipeline(split_input)
            if response is None:
                print("Couldn't process the Task. Moving to next chunk....")
                continue
        # Checked after every attempt (first call or retry) so the marker can
        # never end up inside the list of responses.
        if response == -1:
            return -1
        # A None from the first call (reply could not be parsed) is still
        # recorded, as before, so the list keeps one entry per answered chunk.
        responses.append(response)
        # Pause between chunks to space out the requests.
        time.sleep(25)
    return responses

In [None]:
## Test sample to invoke LLM:

# Smoke test: push one short sample excerpt through invoke_LLM.
# Running this cell performs real API requests.
_sample_text = """Bristol-Myers Squibb Company (which may be referred to as Bristol-Myers Squibb, BMS, the Company, we, our or us) is a global specialty biopharmaceutical company whose mission is to discover, develop and deliver innovative medicines that help patients prevail over serious diseases.

    We continue to evolve our business to a leading diversified specialty biopharma company. The evolution was accelerated as a result of the diabetes business divestiture and continued focus on certain therapeutic areas, including immuno-oncology. The following provides a brief summary of certain key events in 2014 ,as discussed in more detail throughout this report.

    Opdivo was approved in the U.S. and Japan for unresectable or metastatic melanoma, and we announced positive results from certain other studies in melanoma, lung, Hodgkin Lymphoma and renal cell carcinoma. Several clinical collaborations were also entered into by us to seek opportunities to strategically combine Opdivo with other targeted agents in more than a dozen tumor types. Eliquis obtained an important label extension in 2014. We received regulatory approvals for our Hepatitis C Franchise, including Daklinza in the EU and our dual regimen of Daklinza and Sunvepra in Japan. Several business development transactions were completed in 2014, to advance our pipeline in other therapeutic areas, including fibrosis and genetically defined diseases. We are also expanding our biologics manufacturing capacity at Devens, Massachusetts and announced plans to build a new facility in Ireland.

    Our revenues decreased by 3% in 2014 as a result of the diabetes business divestiture, exclusivity losses and expiration of rights partially offset by higher sales of key products, including recently launched products in certain markets. Our focus to optimize global brands and key markets accelerated growth of several key products. Eliquis sales increased in 2014 by $628 million following its global launch in 2013. Yervoy sales increased by 36%, or $348 million, from continued penetration in the U.S. community-based setting and first line indication and improved access internationally. Hepatitis C Franchise sales were $256 million following launches in Japan and certain EU countries. We expect these products will continue to grow in 2015 along with Orencia, Sprycel and recently launched Opdivo which will partially offset revenue reductions resulting from the expiration of certain rights pertaining to Abilify* in the U.S., royalty and alliance agreements, exclusivity losses for Baraclude in the U.S. and changes in foreign currency rates.

    Higher pension and research and development related charges contributed to the reduction of GAAP EPS from $1.54 in 2013 to $1.20 in 2014. Non-GAAP EPS increased from $1.82 to $1.85. Proceeds from the diabetes divestiture increased cash and marketable securities by $3.5 billion.

    """

try:
    invoke_LLM(_sample_text)
except StopIteration as timeout_err:
    # StopIteration is the exception the pipeline's timeout decorator raises.
    print(timeout_err)

Extracting the KeyPhrases

In [5]:
# Walk ../data/10-Ks, send every 'Management.txt' that has no results file yet
# through invoke_LLM, and store the raw per-chunk responses next to it.
cwd = os.getcwd()
pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

root_directory = pwd
target_file = 'Management.txt'
gpt_response_file = 'Management_' + str(model) + '_' +str(task) + '_response.json'
gpt_result_file = 'Management_' + str(model) + '_' +str(task) + '_results.json'

# Set once invoke_LLM reports the -1 marker so that both loops stop.
rate_limited = False

for root, directories, _ in os.walk(root_directory):
    for directory in directories:
        folder_path = os.path.join(root, directory)
        files = os.listdir(folder_path)
        # Folders that already hold extracted results are not queried again.
        if gpt_result_file in files:
            continue
        if target_file not in files:
            continue

        file_pth = os.path.join(folder_path, target_file)
        print(file_pth)
        try:
            with open(file_pth, 'r', encoding = 'utf-8') as rf:
                input_ = rf.read()
        except UnicodeDecodeError as err:
            print(err)
            continue

        input_ = input_.replace("\n","")
        res = invoke_LLM(input_, chunk_)

        if res == -1:
            print("Rate Limit Exceeded. Sleep Mode....!!!")
            # The bare name `exit` did nothing; actually stop processing.
            rate_limited = True
            break

        # None must be tested before len(), otherwise len(None) raises.
        # NOTE(review): this skips every document with five or fewer chunk
        # responses; with chunk_ = 0 there is only one response per document,
        # so nothing would be saved - confirm the threshold is intended.
        if res is None or len(res) <= 5:
            continue

        # Serialise first so a failure cannot leave a truncated JSON file.
        try:
            payload = json.dumps(res, indent=4)
        except (TypeError, ValueError) as err:
            print(err)
            continue

        output_file_path_responses = os.path.join(folder_path,gpt_response_file)
        with open(output_file_path_responses,"w") as jsonfile1:
            jsonfile1.write(payload)

    if rate_limited:
        break

/home/damir/Dropbox/Documents/Teaching/2023/L715/l715_23/data/10-Ks/Rocket_Lab_USA_Inc/2022/Management.txt
{
  "id": "chatcmpl-8RYJbz4LCZtaA6NrqEnSuBkIpDkbV",
  "object": "chat.completion",
  "created": 1701577455,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": null,
        "function_call": {
          "name": "extract_keyphrase",
          "arguments": "{\n\"key_phrases\": [\"Management\u2019s Discussion and Analysis of Financial Condition and Results of Operations\", \"consolidated results of operations and financial condition\", \"consolidated financial statements\", \"Annual Report on Form 10-K\", \"forward-looking statements\", \"risks and uncertainties\", \"Rocket Lab\", \"end-to-end space company\", \"launch services\", \"spacecraft design services\", \"spacecraft components\", \"spacecraft manufacturing\", \"on-orbit management solutions\", \"space data applications\", \"Launch Services\", \"Sp

Extracting the results into a json file

In [2]:
def to_json_kp(x, min_phrases=6):
    """Flatten per-chunk key-phrase lists into a single list.

    Args:
        x: Iterable with one entry per chunk; each usable entry is a list
            (or tuple) of key phrases.
        min_phrases: Minimum number of phrases a chunk must contain to be
            kept. The default of 6 keeps the original rule of dropping
            chunks with five or fewer phrases.

    Returns:
        A new list with the phrases of all kept chunks, in input order.
    """
    final_list = []
    for chunk_phrases in x:
        # A chunk whose reply could not be parsed is stored as None (JSON
        # null); skip such entries instead of failing in len().
        if not isinstance(chunk_phrases, (list, tuple)):
            continue
        if len(chunk_phrases) < min_phrases:
            continue
        final_list.extend(chunk_phrases)
    return final_list

Extracting the KP

In [6]:
# Walk ../data/10-Ks and turn every saved '<...>_response.json' into a flat
# '<...>_results.json' stored in the same folder.
cwd = os.getcwd()
pwd = os.path.join(os.path.dirname(cwd), os.path.join('data','10-Ks'))

root_directory = pwd
target_file = 'Management_' + str(model) + '_' +str(task) + '_response.json'
res_target = 'Management_' + str(model) + '_' +str(task) + '_results.json'

for root, directories, _ in os.walk(root_directory):
    for directory in directories:
        folder_path = os.path.join(root, directory)
        if target_file not in os.listdir(folder_path):
            continue

        file_pth = os.path.join(folder_path, target_file)
        print(file_pth)
        # Context manager so the response file is closed after loading.
        with open(file_pth) as file_:
            cleaned_res = to_json_kp(json.load(file_))

        # Serialise before opening the output so a failure cannot leave a
        # truncated results file behind.
        try:
            payload = json.dumps(cleaned_res, indent=4)
        except (TypeError, ValueError) as err:
            print(err)
            continue

        output_file_path_ = os.path.join(folder_path,res_target)
        with open(output_file_path_,"w") as jsonfile:
            jsonfile.write(payload)

/Users/tanmayiballa/Downloads/KG-LLM-test/l715_23/data/10-Ks/Aerojet_Rocketdyne_Holdings_Inc./2013/Management_gpt-3.5-turbo_KP_response.json
