# **대출.AI** [UPSTAGE API MODEL FINE-TUNING]

Importing Libraries

In [57]:
import requests
import json
from llama_index.llms.upstage import Upstage
import requests
from openai import OpenAI
from dotenv import load_dotenv
from predibase import Predibase, FinetuningConfig, DeploymentConfig
import csv
import numpy as np
from numpy.random import default_rng
import os
import pandas as pd
load_dotenv()

True

Extract environment keys, set up Upstage API and Predibase

In [2]:
# Credentials and tenant id come from the environment; a missing variable raises KeyError.
UPSTAGE_API_KEY = os.environ['UPSTAGE_API_KEY']
PB_API_KEY = os.environ['PB_API_KEY']
tenant_id = os.environ['TENANT_ID']
# Identifier of the base model that gets fine-tuned and later queried.
base_model = "solar-1-mini-chat-240612"
# Predibase client, used below for datasets, repos and adapters.
pb = Predibase(api_token=PB_API_KEY)
pb
# OpenAI-compatible client pointed at the Upstage Solar endpoint.
client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)


### SUMMARIZE MODEL

In [3]:
def prompt_summarize(context, client=client):
    """Summarize one page of a financial statement with the Solar chat model.

    Args:
        context: Text (and table HTML) extracted from a single page.
        client: OpenAI-compatible client used to send the chat request.

    Returns:
        The summary text. An empty string is returned when the model sends
        back no content, so callers can always concatenate the result.
    """
    completion = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert in financial analysis. Below is a page from a financial statement document. 
                Your task is to summarize the key financial information, including all relevant numerical figures, trends, and notable observations. 
                Ensure that no important numerical data is omitted. Your summary should be clear, concise, and no longer than 200 words, focusing only on the most significant details. 
                If you find the page lacks sufficient information for a summary, you MUST return an empty response.
                """
            }, 
            {
                "role": "user",
                "content": f"""
                This is the text you are to summarize:
                {context}
                """
            }
        ],
        stream=False
    )
    # The system prompt explicitly asks for an empty response on uninformative
    # pages, and `content` is optional in the SDK; normalise a missing value
    # to "" so string concatenation in the caller cannot fail.
    return completion.choices[0].message.content or ""

### Data Preprocessing

In [4]:
def process_documents(filepath, url = "https://api.upstage.ai/v1/document-ai/layout-analysis", API_KEY = UPSTAGE_API_KEY):
    """Run layout analysis on a local document and summarize it page by page.

    Args:
        filepath: Path of the document to upload.
        url: Layout-analysis endpoint to post the document to.
        API_KEY: Bearer token for the endpoint.

    Returns:
        The per-page summaries from ``prompt_summarize`` concatenated in
        ascending page order.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    headers = {"Authorization": f"Bearer {API_KEY}"}
    # Use a context manager so the upload handle is closed once the request
    # has been sent instead of leaking for the life of the process.
    with open(filepath, "rb") as document:
        response = requests.post(url, headers=headers, files={"document": document})
    # Fail with the real HTTP error rather than a KeyError on the payload.
    response.raise_for_status()
    obj = response.json()

    # Group element content by the page number the API reports. Iterating
    # range(billed_pages) only works if pages are numbered from 0; with
    # 1-based numbering it summarises an empty "page 0" and silently drops
    # the last page.
    pages = {}
    for element in obj['elements']:
        # Tables keep their HTML markup; everything else uses plain text.
        field = 'html' if element['category'] == 'table' else 'text'
        pages.setdefault(element['page'], []).append(f"\n{element[field]}")

    return "".join(
        prompt_summarize("".join(pages[page])) or ""
        for page in sorted(pages)
    )

In [5]:
def extract_pdf_files(directory):
    """Return the paths of all PDF files found under ``directory``.

    The tree is walked recursively with ``os.walk``; a file counts as a PDF
    when its name ends with ``.pdf``, compared case-insensitively.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.pdf')
    ]

In [33]:
def batch_process(pdf_urls=None):
    """Summarize a batch of documents into a DataFrame.

    Args:
        pdf_urls: Iterable of document paths passed to ``process_documents``.
            ``None`` (the default) is treated as an empty batch.

    Returns:
        A DataFrame with the columns ``filename`` (base name of the path) and
        ``processed_data`` (the summarised text). Files that fail are reported
        on stdout and skipped. The columns are present even when no file
        succeeds, so callers can index ``processed_data`` unconditionally.
    """
    # Avoid a shared mutable default argument.
    if pdf_urls is None:
        pdf_urls = []
    results = []
    for filename in pdf_urls:
        try:
            context = process_documents(filepath=filename)
        except Exception as e:
            # Best effort: one bad document must not abort the whole batch.
            print(f"Error processing {filename}: {e}")
        else:
            results.append({
                "filename": os.path.basename(filename),
                "processed_data": context
            })
    return pd.DataFrame(results, columns=["filename", "processed_data"])
# Summarise every PDF under "./Fine-Tuning Data"; one row per successfully processed file.
data_df = batch_process(extract_pdf_files("./Fine-Tuning Data"))

### Baseline Solar Model

In [7]:
def prompt_llm(context, client=client):
    """Ask the Solar chat model for a loan decision on summarised statements.

    Args:
        context: Summarised financial statements of the applicant.
        client: OpenAI-compatible client used to send the chat request.

    Returns:
        The model's answer parsed from JSON (expected keys: ``stance``,
        ``insight_1`` .. ``insight_3``), or ``None`` when the answer is
        missing or is not valid JSON.
    """
    completion = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                    1. Analyze the latest financial data provided.
                    2. Decide whether the applicant is suitable for a loan.
                    3. Provide three reasons or insights supporting your decision.

                Your insights must be backed up with financial figures.

                Your response MUST be in JSON format, following the example below:

                {
                    "stance": true,
                    "insight_1": "Example of the first insight",
                    "insight_2": "Example of the second insight",
                    "insight_3": "Example of the third insight"
                }
                """
            }, 
            {
                "role": "user",
                "content": f"""
                These are the applicant's financial statements provided:
                {context}
                """
            }
        ],
        stream=False
    )
    try:
        return json.loads(completion.choices[0].message.content.replace("\n", ""))
    except (json.JSONDecodeError, AttributeError, TypeError, IndexError):
        # Only swallow the "no usable answer" cases (invalid JSON, missing
        # content, no choices); a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        return None

Testing

In [8]:
# Use the summarised text of the first processed file as a sample input.
test_context = data_df.iloc[0]['processed_data']

In [9]:
# Ask the baseline Solar model for a decision on the sample context and display it.
response = prompt_llm(context=test_context)
response

{'stance': True,
 'insight_1': 'The company has a healthy net income of $2,000,000 and a return on equity of 20%.',
 'insight_2': 'The debt-to-equity ratio of 0.5 indicates a low level of financial risk.',
 'insight_3': "The company's total stockholders' equity of $10,000,000 demonstrates a strong financial position."}

Groundedness Check

In [10]:
def groundedness_check(context, response, client=client):
    """Tell whether ``response`` is grounded in ``context``.

    The context is sent as the user turn and the stringified response as the
    assistant turn to the ``solar-1-mini-groundedness-check`` model.

    Returns:
        True only when the model's verdict is exactly ``'grounded'``.
    """
    conversation = [
        {'role': 'user', 'content': context},
        {'role': 'assistant', 'content': str(response)},
    ]
    verdict = client.chat.completions.create(
        model='solar-1-mini-groundedness-check',
        messages=conversation
    )
    label = verdict.choices[0].message.content
    return label == 'grounded'
grounded = groundedness_check(context=test_context, response=response)
grounded

True

### Fine-Tuning Preparation

In [34]:
def prep_finetune_data(data_list):
    """Build grounded (context, response) pairs for fine-tuning.

    Args:
        data_list: Mapping or DataFrame whose ``'processed_data'`` entry
            iterates over summarised document texts.

    Returns:
        A list of ``{'context': ..., 'response': ...}`` dicts. ``response`` is
        the model's decision serialised as JSON. Contexts for which the model
        gave no parseable answer, or whose answer fails the groundedness
        check, are left out.
    """
    final_data = []
    for data in data_list['processed_data']:
        decision = prompt_llm(data)
        # Check for None *before* stringifying: str(None) is the non-None
        # string 'None', which would otherwise slip through as a response.
        if decision is None:
            continue
        # Serialise as real JSON (double quotes, true/false) rather than the
        # Python repr, since the prompts demand JSON and the adapter output is
        # parsed with json.loads.
        response = json.dumps(decision, ensure_ascii=False)
        if groundedness_check(data, response):
            final_data.append({
                'context': data,
                'response': response
            })
    return final_data
# Build the fine-tuning examples from the summarised documents and display them.
data_list = prep_finetune_data(data_df)
data_list

[{'context': '\n                The provided text does not contain any financial statement information. Please provide a page from a financial statement document for accurate summary.Hanwha Systems Co., Ltd reported its Q1 performance for FY2024 on April 26, 2024. Key financial figures include:\n\n- Revenue: KRW 2.5 trillion\n- Operating profit: KRW 300 billion\n- Net profit: KRW 250 billion\n\nNotable observations:\n- Revenue increased by 10% YoY\n- Operating profit grew by 15% YoY\n- Net profit rose by 12% YoY\n\nOverall, Hanwha Systems Co., Ltd demonstrated strong financial performance in Q1 of FY2024, with significant growth in revenue, operating profit, and net profit compared to the same period last year.The document contains Hanwha Systems\' unaudited financial performance data compiled under K-IFRS. It includes both consolidated and individual performance results (provisional). The data is intended for informing shareholders and investors of the company\'s current business stat

In [35]:
def dataset_to_csv(data_list, file_name, max_entry=-1):
    """Write prompt/completion pairs to a CSV with a train/test split column.

    Args:
        data_list: Sequence of dicts with ``'context'`` and ``'response'``.
        file_name: Destination CSV path (overwritten).
        max_entry: Maximum number of rows to write. A negative value (the
            default) means "no limit".

    The CSV has the columns ``prompt``, ``completion`` and ``split``. A fixed
    seed picks 25% of the indices of ``data_list`` as ``"test"``; every other
    row is ``"train"``.
    """
    total = len(data_list)
    # The old default of -1 made `i >= max_entry` true at i == 0, so nothing
    # was written; treat negative values as unlimited instead.
    limit = total if max_entry < 0 else max_entry

    rng = default_rng(seed=14)
    # A set gives O(1) membership tests and plain-int comparison.
    test_index = set(
        rng.choice(total, size=round(0.25*total), replace=False).tolist()
    ) if total else set()

    # NOTE(review): this template writes `<|im_start|>system` and
    # `"stance": True`, while the inference prompt in query_adapter uses
    # `<|im_start|> system` and `"stance": true`. Left unchanged here;
    # confirm which form the model should be trained on.
    template = {
        "prompt":   """
                    <|im_start|>system
                    You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                        1. Analyze the latest financial data provided.
                        2. Decide whether the applicant is suitable for a loan.
                        3. Provide three reasons or insights supporting your decision.

                    Your insights must be backed up with financial figures.

                    Your response MUST be in JSON format, following the example below:

                    {{
                        "stance": True,
                        "insight_1": "Example of the first insight",
                        "insight_2": "Example of the second insight",
                        "insight_3": "Example of the third insight"
                    }}

                    <|im_start|> user
                    These are the applicant's financial statements:
                    {context}
                    """,
        "completion":'{response}<|im_end|>',
        "split": "train"
    }
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(template))
        writer.writeheader()
        for i, data in enumerate(data_list):
            if i >= limit:
                break
            writer.writerow({
                "prompt": template['prompt'].format(context=data['context']),
                "completion": template['completion'].format(response=data['response']),
                # Decide the split per row; mutating the shared template made
                # every row after the first test index a "test" row.
                "split": "test" if i in test_index else template["split"],
            })

In [38]:
# Export the examples; max_entry=len(data_list) places the row cap at the full list length.
dataset_to_csv(
    data_list=data_list,
    file_name='traintestv6.csv',
    max_entry=len(data_list)
)

In [39]:
# Create a Predibase dataset from the exported CSV; it is passed to the fine-tuning job below.
file_name = 'traintestv6.csv'
pb_dataset = pb.datasets.from_file(file_name)

In [21]:
def evaluate_qa_pairs(file_name):
    """Check that every CSV row has a non-empty prompt, completion and split.

    Returns:
        True when all rows pass.

    Raises:
        AssertionError: On the first row with an empty field.
        KeyError: If one of the three columns is missing from the file.
    """
    required = ("prompt", "completion", "split")
    with open(file_name, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            # Look up all three columns first so a missing column raises
            # KeyError before any emptiness check runs.
            values = [record[column] for column in required]
            for value in values:
                assert value
    return True
# Sanity-check the exported CSV: every row must have all three fields filled in.
evaluate_qa_pairs('traintestv6.csv')

True

Create Predibase Repo

In [22]:
def _init_repo_(repo_name='daechul-ai-modelv4'):
    """Create the Predibase repo named ``repo_name``, print it and return it.

    ``exists_ok=True`` is passed to ``pb.repos.create``.
    """
    created = pb.repos.create(
        name=repo_name,
        description="AI Evaluation Model for Daechul AI",
        exists_ok=True,
    )
    print(created)
    return created
repo = _init_repo_()

uuid='fccfec2e-fce7-4a22-b23f-82123488685a' name='daechul-ai-modelv4' description='AI Evaluation Model for Daechul AI'


### Fine-Tuning

In [49]:
# Request a fine-tuning job for `base_model` on `pb_dataset`; the resulting
# adapter is created under `repo`.
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model=base_model,
        # Explicit training hyperparameters for this run.
        epochs=2,
        rank=32,
        learning_rate=1e-3
    ),
    dataset=pb_dataset,
    repo=repo,
    description="Initial Daechul.AI model with defaults"
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `daechul-ai-modelv4/7`. (Job UUID: cedd8934-1dfe-44a6-a343-7d2d04628016).

Watching progress of finetuning job cedd8934-1dfe-44a6-a343-7d2d04628016. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:00:45         
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint │ train_loss │ validation_loss │
├────────────┼────────────┼─────────────────┤
│     1      │   0.2735   │        --       │
│     2      │   0.3051   │        --       │
└────────────┴────────────┴─────────────────┘


Fine-Tuned Adapter ID

In [52]:
# Get adapter ID
# Adapter id in the "<repo>/<tag>" form, passed as `adapter_id` when querying the adapter.
adapter_id = adapter.repo + "/" + str(adapter.tag)
adapter_id

'daechul-ai-modelv4/7'

Query Adapter Function

In [50]:
def query_adapter(adapter_id, context, tenant_id=tenant_id, base_model=base_model, PB_API_KEY = PB_API_KEY):
    """Query the fine-tuned adapter through the Predibase serving endpoint.

    Args:
        adapter_id: Adapter identifier in the ``"<repo>/<tag>"`` form.
        context: Summarised financial statements of the applicant.
        tenant_id: Predibase tenant id used in the serving URL.
        base_model: Base model deployment the adapter is applied to.
        PB_API_KEY: Bearer token for the serving endpoint.

    Returns:
        The model's decision parsed from the JSON in ``generated_text``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        json.JSONDecodeError: If the generated text is not valid JSON.
    """
    prompt = f"""
            <|im_start|> system
            You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

            1. Analyze the latest financial data provided.
            2. Decide whether the applicant is suitable for a loan.
            3. Provide three reasons or insights supporting your decision.

            Your insights must be backed up with financial figures.

            The output must be in JSON format, following the example below STRICTLY:

            {{
                "stance": true,
                "insight_1": "Example of the first insight",
                "insight_2": "Example of the second insight",
                "insight_3": "Example of the third insight"
            }}

            <|im_start|> user
            These are the applicant's financial statements:
            {context}
    """
    # Send POST request
    url = f"https://serving.app.predibase.com/{tenant_id}/deployments/v2/llms/{base_model}/generate"
    payload = {
        "inputs": prompt,
        "parameters": {
            "adapter_id": adapter_id,
            "adapter_source": "pbase",
            "temperature": 0.1,
            "max_new_tokens": 300
        }
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PB_API_KEY}"
    }
    response = requests.post(
        url=url, 
        data=json.dumps(payload),
        headers=headers
    )
    # Surface HTTP failures directly instead of as a KeyError on the
    # missing 'generated_text' field.
    response.raise_for_status()
    return json.loads(response.json()['generated_text'])
# The function takes `adapter_id` (a string id), not an `adapter` object;
# passing `adapter=` raised TypeError.
resp = query_adapter(
    adapter_id=adapter_id,
    context=test_context
)
resp

{'stance': True,
 'insight_1': "The company's net income for the fiscal year ending December 31, 2020, was $2,000,000, indicating a profitable year.",
 'insight_2': "The company's debt-to-equity ratio was 0.5, suggesting a low level of debt relative to equity.",
 'insight_3': "The company's return on equity was 20%, indicating a strong profitability for shareholders."}

### Backend LLM Function

In [53]:
import requests
from io import BytesIO
import json
import requests
from openai import OpenAI
from predibase import Predibase

load_dotenv()


# Credentials and tenant id come from the environment; a missing variable raises KeyError.
UPSTAGE_API_KEY = os.environ['UPSTAGE_API_KEY']
PB_API_KEY = os.environ['PB_API_KEY']
tenant_id = os.environ['TENANT_ID']
# Base model and the fixed adapter version used as defaults by query_adapter.
base_model = "solar-1-mini-chat-240612"
adapter_id = 'daechul-ai-modelv4/7'
pb = Predibase(api_token=PB_API_KEY)
pb
# OpenAI-compatible client pointed at the Upstage Solar endpoint.
client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)

def query_adapter(context, adapter_id=adapter_id, tenant_id=tenant_id, base_model=base_model, PB_API_KEY = PB_API_KEY):
    """Query the fine-tuned adapter through the Predibase serving endpoint.

    Args:
        context: Summarised financial statements of the applicant.
        adapter_id: Adapter identifier in the ``"<repo>/<tag>"`` form.
        tenant_id: Predibase tenant id used in the serving URL.
        base_model: Base model deployment the adapter is applied to.
        PB_API_KEY: Bearer token for the serving endpoint.

    Returns:
        The model's decision parsed from the JSON in ``generated_text``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        json.JSONDecodeError: If the generated text is not valid JSON.
    """
    prompt = f"""
            <|im_start|> system
            You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

            1. Analyze the latest financial data provided.
            2. Decide whether the applicant is suitable for a loan.
            3. Provide three reasons or insights supporting your decision.

            Your insights must be backed up with financial figures.

            The output must be in JSON format, following the example below STRICTLY:

            {{
                "stance": true,
                "insight_1": "Example of the first insight",
                "insight_2": "Example of the second insight",
                "insight_3": "Example of the third insight"
            }}

            <|im_start|> user
            These are the applicant's financial statements:
            {context}
    """
    # Send POST request
    url = f"https://serving.app.predibase.com/{tenant_id}/deployments/v2/llms/{base_model}/generate"
    payload = {
        "inputs": prompt,
        "parameters": {
            "adapter_id": adapter_id,
            "adapter_source": "pbase",
            "temperature": 0.1,
            "max_new_tokens": 300
        }
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PB_API_KEY}"
    }
    response = requests.post(
        url=url, 
        data=json.dumps(payload),
        headers=headers
    )
    # Surface HTTP failures directly instead of as a KeyError on the
    # missing 'generated_text' field.
    response.raise_for_status()
    return json.loads(response.json()['generated_text'])

def prompt_llm(context, client=client):
    """Ask the Solar chat model for a critical loan decision.

    Args:
        context: Summarised financial statements of the applicant.
        client: OpenAI-compatible client used to send the chat request.

    Returns:
        The model's answer parsed from JSON (expected keys: ``stance``,
        ``insight_1`` .. ``insight_3``), or ``None`` when the answer is
        missing or is not valid JSON.
    """
    completion = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                    1. Analyze the latest financial data provided.
                    2. Decide whether the applicant is suitable for a loan.
                    3. Provide three reasons or insights supporting your decision.

                Your insights must be backed up with financial figures. Be as critical as possible

                Your response MUST be in JSON format, following the example below:

                {
                    "stance": true,
                    "insight_1": "Example of the first insight",
                    "insight_2": "Example of the second insight",
                    "insight_3": "Example of the third insight"
                }
                """
            }, 
            {
                "role": "user",
                "content": f"""
                These are the applicant's financial statements provided:
                {context}
                """
            }
        ],
        stream=False
    )
    try:
        return json.loads(completion.choices[0].message.content.replace("\n", ""))
    except (json.JSONDecodeError, AttributeError, TypeError, IndexError):
        # Only swallow the "no usable answer" cases (invalid JSON, missing
        # content, no choices); a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        return None

def prompt_summarize(context, client=client):
    """Summarize one page of a financial statement with the Solar chat model.

    Args:
        context: Text (and table HTML) extracted from a single page.
        client: OpenAI-compatible client used to send the chat request.

    Returns:
        The summary text. An empty string is returned when the model sends
        back no content, so callers can always concatenate the result.
    """
    completion = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert in financial analysis. Below is a page from a financial statement document. 
                Your task is to summarize the key financial information, including all relevant numerical figures, trends, and notable observations. 
                Ensure that no important numerical data is omitted. Your summary should be clear, concise, and no longer than 200 words, focusing only on the most significant details. 
                If you find the page lacks sufficient information for a summary, you MUST return an empty response.
                """
            }, 
            {
                "role": "user",
                "content": f"""
                This is the text you are to summarize:
                {context}
                """
            }
        ],
        stream=False
    )
    # The system prompt explicitly asks for an empty response on uninformative
    # pages, and `content` is optional in the SDK; normalise a missing value
    # to "" so string concatenation in the caller cannot fail.
    return completion.choices[0].message.content or ""

def loan_evaluation(file_url,url="https://api.upstage.ai/v1/document-ai/layout-analysis", API_KEY=UPSTAGE_API_KEY):
    """Download a document, summarize it page by page and get a loan decision.

    Args:
        file_url: URL the document is downloaded from.
        url: Layout-analysis endpoint the document is posted to.
        API_KEY: Bearer token for the layout-analysis endpoint.

    Returns:
        The decision returned by ``query_adapter`` for the concatenated
        per-page summaries.

    Raises:
        requests.HTTPError: If the download or the layout-analysis request
            answers with an error status.
    """
    download = requests.get(file_url)
    download.raise_for_status()

    # Keep the downloaded bytes in memory and upload them as the document.
    files = {"document": BytesIO(download.content)}
    headers = {"Authorization": f"Bearer {API_KEY}"}

    api_response = requests.post(url, headers=headers, files=files)
    api_response.raise_for_status()
    obj = api_response.json()

    # Group element content by the page number the API reports. Iterating
    # range(billed_pages) only works if pages are numbered from 0; with
    # 1-based numbering it summarises an empty "page 0" and silently drops
    # the last page.
    pages = {}
    for element in obj['elements']:
        # Tables keep their HTML markup; everything else uses plain text.
        field = 'html' if element['category'] == 'table' else 'text'
        pages.setdefault(element['page'], []).append(f"\n{element[field]}")

    context = "".join(
        prompt_summarize("".join(pages[page])) or ""
        for page in sorted(pages)
    )

    return query_adapter(context)

Example Usage

In [55]:
# Run the full pipeline (download, layout analysis, summaries, adapter query) on a PDF fetched from a URL.
loan_results=loan_evaluation('https://utfs.io/f/2d94c9cb-82b5-4a15-b6fe-aa1f42820a91-aakpmq.pdf')

In [56]:
# Show the decision returned by loan_evaluation.
print(loan_results)

{'stance': True, 'insight_1': 'The company has a strong balance sheet with a substantial equity base of $30,000,000, which indicates financial stability and a lower risk of default on loans.', 'insight_2': "The company's revenue growth of $40,000,000 and net income margin of 50% ($20,000,000 net income) demonstrate profitability and potential for loan repayment.", 'insight_3': "The plant-based meat market is experiencing rapid growth, with a projected CAGR of 19.4% from 2023 to 2030, indicating a promising market for FFC LLC's products and potential for loan repayment."}
