In [24]:
import langchain.prompts as prompts
from langchain.schema import SystemMessage
from langchain.prompts.chat import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
import os

import io
import requests
from PyPDF2 import PdfReader

from langchain.vectorstores import Pinecone
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

In [25]:
os.environ['OPENAI_API_KEY'] = '<your API key>'

### Prompts template 

In [31]:
qa_prompt = prompts.PromptTemplate(
    input_variables=["question", "context_str", "length"],
    template="""Write an answer ({length}) 
    for the question below solely based on the provided context. 
    If the context provides insufficient information,
    reply 'I cannot answer'. 
    For each sentence in your answer, indicate which sources most support it
    via valid citation markers at the end of sentences, like (Example2012).
    Answer in an unbiased and scholarly tone. Make clear what is your opinion.
    Use Markdown for formatting code or text, and try to use direct quotes to support arguments.\n\n
    {context_str}\n
    Question: {question}\n
    Answer: """,
)

def make_chain(prompt, llm):
    if type(llm) == ChatOpenAI:
        system_message_prompt = SystemMessage(
            content="""You are a scholarly researcher that answers in an unbiased, scholarly tone.
            You sometimes refuse to answer if there is insufficient information.""",
        )
        human_message_prompt = HumanMessagePromptTemplate(prompt=prompt)
        prompt = ChatPromptTemplate.from_messages(
            [system_message_prompt, human_message_prompt]
        )
    return LLMChain(prompt=prompt, llm=llm)

### Generate data 

In [32]:
import csv
import random
import string


# function to generate random strings of length n
def generate_random_string(n):
    return ''.join(random.choices(string.ascii_letters, k=n))


# function to generate random integer between a and b
def generate_random_integer(a, b):
    return random.randint(a, b)


# function to generate random decimal between a and b
def generate_random_decimal(a, b):
    return round(random.uniform(a, b), 2)


# list of industries
industries = ['Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail', 'Construction', 'Transportation']

# list of business rationales
business_rationales = ['Expansion', 'Inventory', 'Equipment', 'Marketing', 'Working Capital']

# number of loan applications to generate
k = 100

# generate loan application data and write to CSV file
with open('loan_applications.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Applicant Name', 'Applicant Age', 'Company Name', 'Revenue', 'Profit', 'Industry',
                     'Employee Count', 'Year Founded', 'Assets', 'Liability', 'Liquid Cash',
                     'Number of Key Suppliers', 'Customer Count', 'Business Rationale', 'Loan Amount',
                     'Loan Period in Years'])
    for i in range(k):
        applicant_name = generate_random_string(10)
        applicant_age = generate_random_integer(18, 75)
        company_name = generate_random_string(10) + ' ' + generate_random_string(5) + ' ' + 'Ltd.'
        revenue = generate_random_decimal(100000, 10000000)
        profit = generate_random_decimal(10000, 5000000)
        industry = random.choice(industries)
        employee_count = generate_random_integer(20, 100)
        year_founded = generate_random_integer(1900, 2023)
        assets = generate_random_decimal(100000, 10000000)
        liability = generate_random_decimal(10000, 1000000)
        liquid_cash = generate_random_decimal(10000, 1000000)
        num_key_suppliers = generate_random_integer(1, 20)
        customer_count = generate_random_integer(100, 10000)
        business_rationale = random.choice(business_rationales)
        loan_amount = generate_random_integer(10000, 500000)
        loan_period = generate_random_integer(1, 25)
        writer.writerow([applicant_name, applicant_age, company_name, revenue, profit, industry,
                         employee_count, year_founded, assets, liability, liquid_cash,
                         num_key_suppliers, customer_count, business_rationale, loan_amount,
                         loan_period])

In [33]:
import pandas as pd
data = pd.read_csv('loan_applications.csv')

In [34]:
data 

Unnamed: 0,Applicant Name,Applicant Age,Company Name,Revenue,Profit,Industry,Employee Count,Year Founded,Assets,Liability,Liquid Cash,Number of Key Suppliers,Customer Count,Business Rationale,Loan Amount,Loan Period in Years
0,YTrzcTxqzf,66,pQYCBGYHyL soIni Ltd.,6615585.67,1265747.43,Manufacturing,44,1923,1456600.89,978683.97,622218.49,8,7306,Marketing,90593,23
1,hAEmtDVQna,63,owRXPIORYt rdlHx Ltd.,6700688.32,2306607.58,Manufacturing,38,1905,8760078.11,303093.78,441820.37,6,5652,Marketing,30732,6
2,EvPJYhlyAl,20,chFjbYVFWL XOfVz Ltd.,1446104.48,1234785.57,Transportation,50,1941,6854715.59,421555.43,644272.08,2,4194,Expansion,166386,16
3,andtaMTUcQ,23,VNeAKikJOJ WRRbx Ltd.,5947988.11,3611727.37,Retail,97,1978,7759492.31,462657.48,12971.29,20,5504,Working Capital,462901,20
4,PNaAiNxHPg,41,SWQDsWDOmn QgGqs Ltd.,3264260.54,1358341.36,Construction,61,1974,3071638.21,53163.21,76979.20,17,4329,Equipment,137337,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,VAKmgzxVLR,21,ZHFQbnrCBy JFqLw Ltd.,8873410.93,4225820.23,Technology,61,1923,406395.63,37800.43,568034.76,3,5569,Inventory,353916,24
96,anQBJQUXSX,55,QPUDTIWggf emWNu Ltd.,1463128.59,4215677.47,Retail,63,2009,1816302.00,581756.07,261506.20,3,2696,Marketing,58944,9
97,EiawkzetJh,41,rlTzzPYbwj pJzdo Ltd.,1798994.86,236037.32,Manufacturing,85,2003,9383912.23,942213.06,55279.62,11,9172,Equipment,130071,18
98,ZSDtlpFOVm,19,wkpPHhuSSl iqEoM Ltd.,1726320.09,3360065.26,Retail,97,1989,3818744.63,671214.39,705426.23,19,3940,Expansion,417742,24


### Modelling 

In [24]:
llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")
qa_chain = make_chain(prompt=qa_prompt, llm=llm)

In [25]:
query = ''
docs = 'embeddings to be parse to the model'
tokens = 0 
with get_openai_callback() as cb:
    answer_text = qa_chain.run(
        question=query, context_str= docs, length=50
    )
    tokens += cb.total_tokens

In [None]:
answer_text

In [27]:
tokens

1126