In [1]:
from langchain import OpenAI, PromptTemplate
from langchain.callbacks import get_openai_callback
from dotenv import load_dotenv
import json
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import re

load_dotenv()


True

In [5]:
llm = OpenAI(
        model="gpt-3.5-turbo-instruct",
        temperature=0.5
    )


In [2]:
from os import listdir

jsons = [i for i in listdir('/Users/waterdamage123/Documents/GitHub/ai-hackathon/data_0611/OCR_output') if i.split('.')[-1]=='json']


In [11]:
for i in jsons:

    print(i)

    document_path = f"../data_0611/OCR_output/{i}"

    with open(document_path, 'r') as f:
        loaded_json = json.load(f)

    json_key_value_pairs = loaded_json['key-value_pairs']

    synonyms = ['Final Valuation Date', 'Redemption Valuation Date', 'Redemption Date', 'Determination Date']

    for i in synonyms:

        final_val_date = json_key_value_pairs.get(i)

        if final_val_date:

            break

    try:
        final_val_date = final_val_date[0]
    except TypeError as e:
        print(e)

    print(final_val_date,'\n')


XS2444480540.json
August 31, 2023 

XS2424913601.json
28 February 2028 

XS2474851479.json
26 December 2023 

XS2033997748.json
May 30th, 2023 

XS2565974388_USD-TwinWin-18M-.SPX^M.STOXX50E-Final.json
22 August 2024 

XS2439264677.json
'NoneType' object is not subscriptable
None 

AB5PFT - BNP 2-Year USD Bonus Certificate Worst-Of on Euro Stoxx 50 Price EUR Swiss Market Index and SP 500.json
'NoneType' object is not subscriptable
None 

XS2493928639.json
29 January 2024 

XS2317910607.json
May 10th, 2023 

AB3ZFW - RBC 18-Month EUR Twin Win Autocallable on Societe Generale SA.json
11 July 2023 

XS2575193920 (TS) - 5y USD Autocallable Note on DAXK FTSEMIB IBEX (EN).json
8 February 2028 

XS2472707103.json
15 December 2023 

XS2355961215.json
'NoneType' object is not subscriptable
None 

XS2358486194.json
'NoneType' object is not subscriptable
None 

XS2333422389.json
'NoneType' object is not subscriptable
None 

XS2021832634.json
21.04.2023 

EU2320735_20230302_XS2566031444_Decentia-US

In [12]:
full_text = loaded_json['full_text']


In [13]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator=" ")
texts = text_splitter.split_text(full_text)


In [14]:
embeddings = OpenAIEmbeddings()
db = FAISS.from_texts(texts, embeddings)


In [15]:
retriever = db.as_retriever(search_kwargs={"k": 4})
docs = retriever.get_relevant_documents("What is the final valuation date, redemption date, or determination date of this term sheet?")


In [16]:
prompt = PromptTemplate.from_template(
    "Take a deep breath and relax. Think step by step."
    "I have the following  document of a term sheet"
    "I need to find the Final Valuation Date (which is the last day that the product trades or is valid.)"
    "The format of the Final Valuation Date is usually in dd/mm/yyyy, mm/dd/yyyy, yyyy/mm/dd, or in natural language using the name of the month."
    "It is also sometimes called `Determination Date` or `Redemption Valuation Date`"
    "It is really important that you get this right, because my life depends on it!\n"
    "Example:"
    "Context: Final Valuation Date 3rd July 2020"
    "Final Valuation Date: 03/07/2020"
    "Example:"
    "Context: Final Valuation Date 2018.03.01"
    "Final Valuation Date: 01/03/2018"

    "I am going to give you a chunk of text, and you need to tell me which one is the Final Valuation Date of the document\n\n"
    "You must return it in the correct date format: dd/mm/yyyy and only return this value"

    "Context:{context}\n"
    "Final Valuation Date: <your answer here>"
)


In [17]:
full_context = ''
for doc in docs:
    full_context += '\n' + doc.page_content
if final_val_date:
    try:
        final_val_date_confidence = json_key_value_pairs['Final Valuation Date'][1]
    except KeyError:
        final_val_date_confidence = json_key_value_pairs['Trade Date'][1]
    full_context += '\n' + f' I also did an OCR anaysis and found Final Valuation Date: {final_val_date} with a confidence of {final_val_date_confidence}'


In [25]:
print(full_context)



Underlying closes below its Knock-out Price on the Redemption Valuation Date, the conditional protection is terminated and the product will be redeemed by cash settlement at theofficial closing price of the Underlying on the Redemption Valuation Date. In the event that the Certificates are redeemed early as a consequence of force majeure, illegality or certain other events affecting the Underlying(s) and/or hedge, Holders may receive an amount per Certificate equal to the fair market value of each Certificate less the cost to the Issuer and/or its affiliates of unwinding any underlying related hedging arrangements. Such amount may be less than the Notional Amount or even zero.
Risk Tolerance
Holders of this product should be experienced and familiar with both derivative products and the Underlying. Holders must be willing to make an investment that is exposed to the full down-side risk of the Underlying. Holders do not require capital protection.
Date
March 10th, 2023
Strike Date
Marc

In [18]:
final_prompt = prompt.format(context=full_context)
result = llm.invoke(final_prompt)


In [19]:
re_pattern = r'((0[1-9]|[12][0-9]|3[01])\/(0[1-9]|1[0-2])\/([0-9]{4}))'

matches = re.findall(re_pattern, result)


In [20]:
if matches:
    final_val_date = matches[0][0]
else:
    final_val_date = None


In [22]:
result


'\nFinal Valuation Date: 17/03/2025'

In [23]:
document_path


'../data_0611/OCR_output/AB5P75 - BNP 2-Year USD Bonus Certificate Worst-Of on Euro Stoxx 50 Price EUR, FTSE 100 Index and S_P 500.json'

In [21]:
final_val_date


'17/03/2025'