In [1]:
from langchain import OpenAI, PromptTemplate
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import re

# Load environment variables from a .env file
# (presumably the OpenAI API key used by the cells below — TODO confirm).
load_dotenv()


True

In [64]:
# Path of the OCR output JSON to analyse.
# Alternative input kept for reference:
#   document_path = "../data_0611/OCR_output/XS2021832634.json"
document_path = "../data_0611/OCR_output/EU2320735_20230302_XS2566031444_Decentia-USD-COBA Memory-48M-.RUT+.STOXX-Final.json"


In [62]:
# Completion model that is asked for the launch date further down.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)


In [65]:
# Parse the OCR output file. The encoding is pinned to UTF-8 so that non-ASCII
# characters are decoded the same way on every platform instead of depending on
# the locale's default encoding.
with open(document_path, 'r', encoding='utf-8') as f:
    loaded_json = json.load(f)


In [66]:
# Key/value pairs section of the OCR output; later cells index each entry with [0] and [1].
json_key_value_pairs = loaded_json["key-value_pairs"]


In [None]:
# Labels under which the launch date is looked up in the key/value pairs,
# tried in this order.
synonyms = [
    'Launch Date',
    'Trade Date',
]


In [67]:
# Look up the launch date in the OCR key/value pairs, trying each label in
# `synonyms` in order and keeping the first non-empty entry.
# `launch_date` is always defined afterwards: the value at index 0 of the
# entry when one is found, otherwise None.
launch_date = None

for label in synonyms:
    entry = json_key_value_pairs.get(label)
    if entry:
        # Index 0 holds the value; index 1 is read elsewhere as the confidence.
        launch_date = entry[0]
        break
else:
    # No label had a usable entry: report it explicitly rather than through an
    # incidental TypeError/IndexError from subscripting None or an empty value.
    print(f"No launch date found in the key-value pairs under any of: {synonyms}")


In [68]:
# Show the launch date taken from the key/value pairs.
launch_date


'2 March 2023'

In [69]:
# Full document text from the OCR output; it is chunked and indexed below.
full_text = loaded_json["full_text"]


In [70]:
# Split the full text on spaces into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_text(full_text)


In [71]:
# Build a FAISS vector store over the chunks using OpenAI embeddings.
embeddings = OpenAIEmbeddings()
db = FAISS.from_texts(
    texts,
    embeddings,
)


In [72]:
# Retrieve the chunks relevant to the launch-date question (k=4).
retriever = db.as_retriever(
    search_kwargs={"k": 4},
)
docs = retriever.get_relevant_documents(
    "What is the launch date of this term sheet?"
)


In [73]:
# Prompt template for extracting the launch date; `{context}` is the only
# template variable.
# The pieces below are adjacent string literals, which Python concatenates with
# no separator. Each piece therefore ends with an explicit newline so that the
# instructions and the few-shot examples do not run together in the final prompt.
prompt = PromptTemplate.from_template(
    "Take a deep breath and relax. Think step by step.\n"
    "I have the following  document of a term sheet\n"
    "I need to find the Launch Date (which is the date that the product begins trading)\n"
    "The format of the launch date is usually in dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd, or in natural language using the name of the month.\n"
    "It is also sometimes called `Trade Date`\n"
    "It is really important that you get this right, because my life depends on it!\n"
    "\n"
    "Example:\n"
    "Context: Launch date 3rd July 2020\n"
    "Launch Date: 03/07/2020\n"
    "\n"
    "Example:\n"
    "Context: Launch date 2018.03.01\n"
    "Launch Date: 01/03/2018\n"
    "\n"
    "I am going to give you a chunk of text, and you need to tell me what is the launch date of the document\n\n"
    "You must return it in the correct date format dd/mm/yyyy and only return this value.\n"
    "\n"
    "Context:{context}\n"
    "Launch Date: <your answer here>"
)


In [75]:
# Assemble the context passed to the LLM: every retrieved chunk, each preceded
# by a newline.
full_context = ''.join('\n' + doc.page_content for doc in docs)

# Find the key/value entry to take the confidence from: the first label in
# `synonyms` with a non-empty entry. This is the same rule used to pick
# `launch_date`, so value and confidence come from the same entry even when an
# earlier label is present but empty.
ocr_entry = next(
    (json_key_value_pairs[label] for label in synonyms if json_key_value_pairs.get(label)),
    None,
)

# Add the OCR hint only when both the date and its entry are available; a
# missing entry no longer raises KeyError.
if launch_date and ocr_entry:
    launch_date_confidence = ocr_entry[1]  # index 1 is read as the confidence
    full_context += '\n' + f' I also did an OCR anaysis and found Launch Date: {launch_date} with a confidence of {launch_date_confidence}'


In [77]:
# Fill the template with the assembled context, then send it to the model.
final_prompt = prompt.format(
    context=full_context,
)
result = llm.invoke(final_prompt)


In [85]:
# Show the raw completion returned by the model.
result


'\n\nBased on the context and information provided, the launch date of the document is 3 March 2023. It is mentioned in the section "The Offering Prospectus as well as the Guarantee can be obtained free of charge electronically or physically at the offices of the Guarantor and Arranger’s branch in Switzerland" and is also mentioned as the "Payment Date" in the section "Summary of Terms and Conditions". The correct format for the launch date is dd/mm/yyyy, so the launch date is 03/03/2023.'

In [162]:
# Matches a dd/mm/yyyy date anywhere in the text.
# Group 1 is the whole date; groups 2-4 are day, month and year, so each
# findall() match is a 4-tuple whose first element is the full date.
# The pattern deliberately has no leading '/': with one, a date only matched if
# some other slash appeared earlier in the text, so a bare "03/03/2023" answer
# produced no match at all.
re_pattern = r'((0[1-9]|[12][0-9]|3[01])\/(0[1-9]|1[0-2])\/([0-9]{4}))'

matches = re.findall(re_pattern, result)


In [None]:
# If the completion contained a date, use the first one as the launch date.
if matches:
    # The first element of a match is the complete date string.
    full_date, *_ = matches[0]
    launch_date = full_date
