# **대출.AI** [UPSTAGE API MODEL FINE-TUNING]

Importing Libraries

In [2]:
import requests
import json
from llama_index.llms.upstage import Upstage
import requests
from openai import OpenAI
from dotenv import load_dotenv
from predibase import Predibase, FinetuningConfig, DeploymentConfig
import csv
import numpy as np
from numpy.random import default_rng
import os
import pandas as pd
import glob
load_dotenv()

False

Extract environment keys, set up Upstage API and Predibase

In [5]:
UPSTAGE_API_KEY = os.environ['UPSTAGE_API_KEY']
PB_API_KEY = os.environ['PB_API_KEY']
tenant_id = os.environ['TENANT_ID']
base_model = "solar-1-mini-chat-240612"
pb = Predibase(api_token=PB_API_KEY)
pb
client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)


Summarize Model

In [23]:
def prompt_summarize(context, client=client):
    stream = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert in financial analysis. Below is a page from a financial statement document. 
                Your task is to summarize the key financial information, including all relevant numerical figures, trends, and notable observations. 
                Ensure that no important numerical data is omitted. Your summary should be clear, concise, and no longer than 200 words, focusing only on the most significant details. 
                If you find the page lacks sufficient information for a summary, you MUST return an empty response.
                """
            }, 
            {
                "role": "user",
                "content": f"""
                This is the text you are to summarize:
                {context}
                """
            }
        ],
        stream=False
    )
    response = stream.choices[0].message.content
    return response

Data Preprocessing

In [24]:
def process_documents(filepath, url = "https://api.upstage.ai/v1/document-ai/layout-analysis", API_KEY = UPSTAGE_API_KEY):
    # Prepare request
    headers = {"Authorization": f"Bearer {API_KEY}"}
    files = {"document": open(filepath, "rb")}
    # Post request
    response = requests.post(url, headers=headers, files=files)
    # Get response JSON object
    obj = response.json()
    # Extract information
    context = ''
    for page in range(obj['billed_pages']):
        page_content = ''
        for element in obj['elements']:
            if element['page'] == page:
                if element['category'] == 'table':
                    page_content += f"\n{element['html']}"
                else:
                    page_content += f"\n{element['text']}"
        context += prompt_summarize(page_content)
    return context

In [25]:
def extract_pdf_files(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

In [26]:
def batch_process(pdf_urls = []):
    results = []
    pdf_files = pdf_urls
    for filename in pdf_files:
        try:
            context = process_documents(filepath=filename)
            results.append({
                "filename": os.path.basename(filename),
                "processed_data": context
            })
        except Exception as e:
            print(f"Error processing {filename}: {e}")
    df = pd.DataFrame(results)
    return df
data_df = batch_process(extract_pdf_files("./"))

Baseline Solar Model

In [27]:
def prompt_llm(context, client=client):
    stream = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                    1. Analyze the latest financial data provided.
                    2. Decide whether the applicant is suitable for a loan.
                    3. Provide three reasons or insights supporting your decision.

                Your insights must be backed up with financial figures.

                Your response MUST be in JSON format, following the example below:

                {
                    "stance": true,
                    "insight_1": "Example of the first insight",
                    "insight_2": "Example of the second insight",
                    "insight_3": "Example of the third insight"
                }
                """
            }, 
            {
                "role": "user",
                "content": f"""
                These are the applicant's financial statements provided:
                {context}
                """
            }
        ],
        stream=False
    )
    try:
        response = json.loads(stream.choices[0].message.content.replace("\n",""))
    except:
        return None
    return response

Testing

In [28]:
test_context = data_df.iloc[0]['processed_data']

In [29]:
response = prompt_llm(context=test_context)
response

{'stance': True,
 'insight_1': 'The company experienced a year-over-year increase in both revenue and operating profit, driven by growth in the defense sector.',
 'insight_2': 'The operating profit margin improved due to revenue growth and cost control measures.',
 'insight_3': 'The figures include the effect of exit business, which may impact the accuracy of the financial analysis.'}

Groundedness Check

In [30]:
def groundedness_check(context, response, client=client):
    result = client.chat.completions.create(
        model='solar-1-mini-groundedness-check',
        messages=[
            {
                'role':'user',
                'content':context
            },
            {
                'role':'assistant',
                'content':str(response)
            }
        ]
    )
    return result.choices[0].message.content == 'grounded'
grounded = groundedness_check(context=test_context, response=response)
grounded

True

Fine-tune

In [31]:
def prep_finetune_data(data_list):
    final_data = []
    for data in data_list['processed_data']:
        response = str(prompt_llm(data))
        if response is not None:
            grounded = groundedness_check(data,response)
            if grounded:
                final_data.append({
                    'context': data,
                    'response': response
                })
            else:
                continue
    return final_data
data_list = prep_finetune_data(data_df)
data_list

[{'context': "\n                The provided text does not contain any financial information or data. Therefore, I am unable to summarize any key financial details.\n                Hanwha Systems Co., Ltd reported its FY2024 Q1 Performance Report on 2024.04.26. The report highlights a 10% YoY increase in revenue to KRW 1.5 trillion. Net profit also increased by 15% YoY to KRW 100 billion. The company's operating profit grew by 12% YoY to KRW 200 billion. Notably, the company's defense systems segment contributed significantly to the growth, with a 20% YoY increase in revenue. Additionally, the company announced plans to invest in research and development to strengthen its position in the aerospace industry.The document contains Hanwha Systems' unaudited financial performance data in accordance with K-IFRS. It includes consolidated and individual performance results, and is intended to inform shareholders and investors about the company's current business status. The financial data may

In [32]:
def dataset_to_csv(data_list, file_name, max_entry=-1):
    rng = default_rng(seed=14)
    test_index = rng.choice(len(data_list), size=round(0.25*len(data_list)), replace=False)
    template = {
        "prompt":   """
                    <|im_start|>system
                    You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                        1. Analyze the latest financial data provided.
                        2. Decide whether the applicant is suitable for a loan.
                        3. Provide three reasons or insights supporting your decision.

                    Your insights must be backed up with financial figures.

                    Your response MUST be in JSON format, following the example below:

                    {{
                        "stance": True,
                        "insight_1": "Example of the first insight",
                        "insight_2": "Example of the second insight",
                        "insight_3": "Example of the third insight"
                    }}

                    <|im_start|> user
                    These are the applicant's financial statements:
                    {context}
                    """,
        "completion":'{response}<|im_end|>',
        "split": "train"
    }
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        fieldnames = template.keys()
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        for i, data in enumerate(data_list):
            if i >= max_entry:
                break
            if i in test_index:
                template["split"] = "test"
            row = {
                "prompt":template['prompt'].format(context=data['context']),
                "completion":template['completion'].format(response=data['response']),
                "split":template["split"] 
            }
            writer.writerow(row)
    f.close()

In [36]:
dataset_to_csv(
    data_list=data_list,
    file_name='traintestv4.csv',
    max_entry=len(data_list)
)

In [38]:
file_name = 'traintestv4.csv'
pb_dataset = pb.datasets.from_file(file_name)

In [None]:
def evaluate_qa_pairs(file_name):
    '''VALIDATION FOR VALID PROMPT, COMPLETION AND SPLIT'''
    with open(file_name, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            prompt, completion, split = row["prompt"], row["completion"], row["split"]
            assert prompt
            assert completion
            assert split
    return True
evaluate_qa_pairs('traintestv2.csv')

True

CREATE REPO

In [39]:
def _init_repo_(repo_name='daechul-ai-modelv4'):
    repo = pb.repos.create(name=repo_name, description="AI Evaluation Model for Daechul AI", exists_ok=True)
    print(repo)
    return repo
repo = _init_repo_()

uuid='fccfec2e-fce7-4a22-b23f-82123488685a' name='daechul-ai-modelv4' description='AI Evaluation Model for Daechul AI'


In [40]:
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model=base_model,
        epochs=2,
        rank=16,
        learning_rate=1e-3
    ),
    dataset=pb_dataset,
    repo=repo,
    description="Initial Daechul.AI model with defaults"
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `daechul-ai-modelv4/1`. (Job UUID: 3caf0f8c-20c2-4ea1-89f0-5571c45391f5).

Watching progress of finetuning job 3caf0f8c-20c2-4ea1-89f0-5571c45391f5. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:00:46         
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint [0m│ train_loss [0m│ validation_loss [0m│
├────────────┼────────────┼─────────────────┤
│     1      [0m│   0.2018   [0m│        --       [0m│
│     2      [0m│   0.2018   [0m│        --       [0m│
└────────────┴────────────┴─────────────────┘


In [46]:
def query_adapter(adapter, context, tenant_id=tenant_id, base_model=base_model, PB_API_KEY = PB_API_KEY):
    prompt = f"""
            <|im_start|> system
            You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

            1. Analyze the latest financial data provided.
            2. Decide whether the applicant is suitable for a loan.
            3. Provide three reasons or insights supporting your decision.

            Your insights must be backed up with financial figures.

            The output must be in JSON format, following the example below:

            {{
                "stance": true,
                "insight_1": "Example of the first insight",
                "insight_2": "Example of the second insight",
                "insight_3": "Example of the third insight"
            }}

            <|im_start|> user
            These are the applicant's financial statements:
            {context}
"""
    # Get adapter ID
    adapter_id = adapter.repo + "/" + str(adapter.tag)
    # Send POST request
    url = f"https://serving.app.predibase.com/{tenant_id}/deployments/v2/llms/{base_model}/generate"
    payload = {
        "inputs": prompt,
        "parameters": {
            "adapter_id": adapter_id,
            "adapter_source": "pbase",
            "temperature": 0.1,
            "max_new_tokens": 300
        }
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PB_API_KEY}"
    }
    response = requests.post(
        url=url, 
        data=json.dumps(payload),
        headers=headers
    )
    return response
# resp = query_adapter(
#     adapter=adapter,
#     context=test_context
# )

### FUNCTION FOR BACKEND

In [56]:
import requests
from io import BytesIO
import json
import requests
from openai import OpenAI
from predibase import Predibase

load_dotenv()


UPSTAGE_API_KEY = os.environ['UPSTAGE_API_KEY']
PB_API_KEY = os.environ['PB_API_KEY']
tenant_id = os.environ['TENANT_ID']
base_model = "solar-1-mini-chat-240612"
pb = Predibase(api_token=PB_API_KEY)
pb
client = OpenAI(
    api_key=UPSTAGE_API_KEY,
    base_url="https://api.upstage.ai/v1/solar"
)





def prompt_llm(context, client=client):
    stream = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are a bank loan assistant tasked with determining the suitability of a loan applicant based on their provided financial statements. You must:

                    1. Analyze the latest financial data provided.
                    2. Decide whether the applicant is suitable for a loan.
                    3. Provide three reasons or insights supporting your decision.

                Your insights must be backed up with financial figures. Be as critical as possible

                Your response MUST be in JSON format, following the example below:

                {
                    "stance": true,
                    "insight_1": "Example of the first insight",
                    "insight_2": "Example of the second insight",
                    "insight_3": "Example of the third insight"
                }
                """
            }, 
            {
                "role": "user",
                "content": f"""
                These are the applicant's financial statements provided:
                {context}
                """
            }
        ],
        stream=False
    )
    try:
        response = json.loads(stream.choices[0].message.content.replace("\n",""))
    except:
        return None
    return response

def prompt_summarize(context, client=client):
    stream = client.chat.completions.create(
        model='solar-1-mini-chat',
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert in financial analysis. Below is a page from a financial statement document. 
                Your task is to summarize the key financial information, including all relevant numerical figures, trends, and notable observations. 
                Ensure that no important numerical data is omitted. Your summary should be clear, concise, and no longer than 200 words, focusing only on the most significant details. 
                If you find the page lacks sufficient information for a summary, you MUST return an empty response.
                """
            }, 
            {
                "role": "user",
                "content": f"""
                This is the text you are to summarize:
                {context}
                """
            }
        ],
        stream=False
    )
    response = stream.choices[0].message.content
    return response

def loan_evaluation(file_url,url="https://api.upstage.ai/v1/document-ai/layout-analysis", API_KEY=UPSTAGE_API_KEY):
    response = requests.get(file_url)
    response.raise_for_status()  

    # Load the file into a BytesIO object
    file_data = BytesIO(response.content)

    headers = {"Authorization": f"Bearer {API_KEY}"}
    files = {"document": file_data}

    # Post request to the API
    api_response = requests.post(url, headers=headers, files=files)
    api_response.raise_for_status()  
    
    obj = api_response.json()

    # Extract information
    context = ''
    for page in range(obj['billed_pages']):
        page_content = ''
        for element in obj['elements']:
            if element['page'] == page:
                if element['category'] == 'table':
                    page_content += f"\n{element['html']}"
                else:
                    page_content += f"\n{element['text']}"
        context += prompt_summarize(page_content)

    loan_results=prompt_llm(context)
    
    return loan_results






### EXAMPLE USAGE

In [57]:
loan_results=loan_evaluation('https://utfs.io/f/2d94c9cb-82b5-4a15-b6fe-aa1f42820a91-aakpmq.pdf',adapter=adapter)

In [59]:
print(loan_results)

{'stance': True, 'insight_1': "The company's target market, which includes health-conscious millennials and Gen Z, vegetarians, vegans, flexitarians, and curious meat-eaters, aligns with current consumer trends and shows potential for strong demand.", 'insight_2': 'The competitive landscape includes traditional fried chicken chains and other plant-based meat companies, but FFC LLC differentiates itself with its proprietary recipe and commitment to sustainability, animal welfare, health, and customer experience.', 'insight_3': "The projected CAGR of 19.4% for the plant-based meat market from 2023 to 2030 indicates a favorable industry growth trend, supporting the potential success of FFC LLC's business model."}
