In [1]:
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

True

In [2]:
# bring in deps
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

In [3]:
import os
import nest_asyncio

# Allow async
nest_asyncio.apply()

In [4]:
# Set file path
FILE_PATH = "/Users/syshin/Desktop/Syshin/LabQ/한국자동차연구원/data/논문/KSAE 학술대회 논문 모음집/2020년/2. 추계/Data/20AKSAE_A004.pdf"

## Default Parsing

In [5]:
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# Configure the LlamaParse instance
parser = LlamaParse(
    result_type="markdown",  # Output format ("text", "markdown", "json", or "structured")
    num_workers=8,
    verbose=True,
    language="en",
    show_progress=True,
)

# Define a file extractor mapping file extensions to parsers
file_extractor = {".pdf": parser}

# Use SimpleDirectoryReader to parse the specified PDF file
documents = SimpleDirectoryReader(
    input_files=[FILE_PATH],  # List of files to process
    file_extractor=file_extractor,
).load_data()

Started parsing the file under job_id ad24523a-c12b-49c3-b4fb-45fbeea4d932
.

In [6]:
documents

[Document(id_='0aa40f4c-fdb9-4f4d-bbb9-5871f404b5ba', embedding=None, metadata={'file_path': '/Users/syshin/Desktop/Syshin/LabQ/한국자동차연구원/data/논문/KSAE 학술대회 논문 모음집/2020년/2. 추계/Data/20AKSAE_A004.pdf', 'file_name': '20AKSAE_A004.pdf', 'file_type': 'application/pdf', 'file_size': 1309200, 'creation_date': '2025-01-14', 'last_modified_date': '2020-11-11'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='# 2020 한국자동차공학회 추계학술대회\n\n# 회귀분석법을 이용한 차량 오일펌프의 인로터 중량 최적화 해석\n\n정대근1)* 김기태1) 명화공업1)\n\nIn-rotor weight optimization analysis of vehicle oil pump using regression analysis method\n\nDae‑geun Jung1) Ki‑Tae Kim1)

In [7]:
# Convert LlamaIndex documents to LangChain document format
docs = [doc.to_langchain_format() for doc in documents]

In [8]:
# Display the content of a specific document (e.g., the 6th document)
print(docs[3].page_content)

|No.|y|No|y|No|y|
|---|---|---|---|---|---|
|1|113.1|10|92.0|19|74.7|
|2|55.8|11|47.7|20|42.5|
|3|33.5|12|29.7|21|26.3|
|4|70.2|13|58.7|22|48.5|
|5|39.8|14|35.5|23|29.5|
|6|49.8|15|43.3|24|38.3|
|7|52.5|16|42.1|25|36.7|
|8|57.6|17|50.1|26|43.0|
|9|35.0|18|29.6|27|26.9|

형은 다음 식(2),(3)와 같다.

𝑦௜ = 𝛽଴ + 𝛽𝑥ଵ௜ + … + 𝛽𝑥ହ௜ + 𝜀 (𝑖 = 1,5)

𝛽, 𝛽,… , 𝛽ହ 미지수

위 중회귀 모형을 행렬식으로 표현하면 다음 식

y = 𝑋𝛽 + 𝜀 → 𝜀 = 𝑦 − 𝑋𝛽

𝐲 = ⎨𝑦ଶ⎧𝑦ଵ⎫ ⎧1 𝑥ଵଵ … 𝑥हଵ⎫

⎪1 𝑥ଵଶ … 𝑥हଶ⎪

. .

⎬𝐗 = ⎨ . . … . ⎬

⎪1 𝑥ଵह . … . ⎪

⎩𝑦ह ⎭ ⎩ ⋯ 𝑥हह ⎪⎭

𝛃 = ⎨𝛽଴⎫ ⎪𝛽⎪ ⎧𝜀ଵ⫽𝜀ଶ

⎩𝛽⎭ 𝛆 = ⎨𝜀ହ ⎭⎩ .

위 식에 오차항의 제곱을 한 최종 식은 다음 식

S = ∑𝑛𝑖=1 𝜀௜ଶ = 𝜀ଵଶ + 𝜀ଶଶ + ⋯ + 𝜀ହଶ = 𝜀

= (y − Xβ)′(y − Xβ)

= (y′ − β′X′)(y − Xβ)

# 3. 다중 회귀 분석 이론

회귀 분석은 X라는 독립변수와 Y라는 종속변수 간의 인과 관계를 찾기 위한 기업으로 종속변인이 얼마나 변할 것인지에 대한 영향력을 예측하는 통계 분석 기법이다.

회귀 분석의 종류는 단순 선형 회귀분석, 다중 회귀분석(중회귀분석), 위계적 다중 회귀 분석, 곡선 회귀분석 등 다양하게 있다.

본 논문에서는 독립변수가 하나 이상의 설계 파


## MultiModal-Parsing

In [9]:
# Configure the LlamaParse instance to use the vendor multimodal model
multimodal_parser = LlamaParse(
    use_vendor_multimodal_model=True,
    # vendor_multimodal_model_name="openai-gpt4o",
    # vendor_multimodal_api_key=os.environ["OPENAI_API_KEY"],
    result_type="markdown",
    language="ko",
)

In [10]:
# Parse the PDF file using the multimodal parser
multimodal_parsed_docs = multimodal_parser.load_data(file_path=FILE_PATH)

Started parsing the file under job_id 30ee2812-b770-44a8-b88a-b033d6c99b38
.

In [11]:
# Convert to langchain document format
multimodal_docs = [doc.to_langchain_format() for doc in multimodal_parsed_docs]

In [12]:
len(multimodal_docs)

6

In [13]:
# Display the content of the first document
print(multimodal_docs[0].page_content)

# 회귀분석법을 이용한 차량 오일펌프의 인로터 중량 최적화 해석

정대근¹) 김기태¹)  
명화공업¹)

## In-rotor weight optimization analysis of vehicle oil pump using regression analysis method

Dae-geun Jung¹) Ki-Tae Kim¹)

¹) MYUNGHWA IND. CO., LTD. R&D Center 65, Beoji-gil-ro, Danwon-gu, Ansan-si, Gyeonggi-do, Korea

### Abstract

This white paper contains information on techniques that can reduce design time. In this study, multiple regression analysis was used as a method to increase work efficiency. The vane, in-rotor of the vehicle oil pump was selected. Variables that can be optimized for weight were selected during design. It has 5 independent variables and 3 factors, which was analyzed by multiple regression. Based on this data, the "Macro" automation sheet was built. As a result of comparing the optimization program and CAE in this paper, the reliability was 91.6%. Through this, the designer can easily and quickly predict design variables without CAE analysis.

### Key words

Multiple regression analysis(다중회귀분석), V

## Custom Parsing Instructions

In [7]:
# Configure parsing instruction
parsing_instruction = """You are extracting information from a Korean research paper. The page layout is left-to-right, top-to-bottom. The document contains the following elements:
1. Table Extraction
	•	Detect and extract tabular data. In this case:
	•	Headers: Include column names like No., Y, y values.
	•	Data Rows: Parse rows with numerical values for each column.
	•	Maintain the structure of the table in markdown format.

2. Mathematical Equations
	•	Identify equations (usually marked with equation numbers like (2), (3), etc.).
	•	Extract equations as text in LaTeX format for proper representation.

3. Figures and Annotations
	•	Extract images or diagrams (e.g., Fig. 4: 인코터 CAE 조건).
	•	Include associated captions in text format.
	•	Save the image separately and reference it in the output metadata.

4. Section Headers
	•	Extract and distinguish section headers with hierarchy (e.g., 3. 다중 회귀 분석 이론).
	•	Mark the headers as:
	•	Level 1: Main section headers.
	•	Level 2: Subsection headers (if any).

5. Text Body
	•	Extract body text associated with each section.
	•	Preserve formatting, such as bold or italicized text.
	•	Group text under the corresponding section header.

6. Symbols and Variables
	•	Parse variables and symbols used in equations or explanations (e.g.,  y_i, \\beta_0, \\beta_1, \\ldots ).
	•	Maintain a list of variables with their context or definitions if provided.

7. Pagination
	•	Include the page number for reference (e.g., Page 4).

8. Footnotes or References
	•	If present, extract footnotes or additional references as a separate section.
 """

In [None]:
# LlamaParse configuration
instruction_parser = LlamaParse(
    use_vendor_multimodal_model=True,
    # vendor_multimodal_model_name="openai-gpt4o",
    # vendor_multimodal_api_key=os.environ["OPENAI_API_KEY"],
    result_type="markdown",
    language="ko",
    parsing_instruction=parsing_instruction,
    # extract_layout=True,
)

# Parse pdf file
instruction_parsed_docs = instruction_parser.load_data(file_path=FILE_PATH)

Started parsing the file under job_id 01c883ea-8e94-42ef-89c1-19ac704fdb76


In [16]:
instruction_parsed_docs

[Document(id_='0d57edf6-eb11-4e74-9868-05b1dbfe4348', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='```json\n{\n  "metadata": {\n    "title": "In-rotor weight optimization analysis of vehicle oil pump using regression analysis method",\n    "authors": ["Dae-geun Jung", "Ki-Tae Kim"],\n    "affiliation": "MYUNGHWA IND. CO., LTD. R&D Center 65, Beoji-gil-ro, Danwon-gu, Ansan-si, Gyeonggi-do, Korea",\n    "emails": ["dgjung@myunghwa.com", "ktkim1@myunghwa.com"],\n    "page": 1\n  },\n  "abstract": "This white paper contains information on techniques that can reduce design time. In this study, multiple regression analysis was used as a method to increase work efficiency. The vane, in-rotor of the vehicle oil pump was selected. Variables that can be optimized for weight were selected during design. It has 

In [17]:
# Convert to langchain documents
instruction_docs = [doc.to_langchain_format() for doc in instruction_parsed_docs]

In [18]:
print(instruction_docs[3].page_content)

```json
{
  "page": 4,
  "table": {
    "title": "Table 3 CAE 주응력 해석 결과",
    "headers": ["No.", "y", "No", "y"],
    "rows": [
      {"No.": 1, "y": 113.1, "No": 10, "y": 92.0},
      {"No.": 19, "y": 74.7, "No": 2, "y": 55.8},
      {"No.": 11, "y": 47.7, "No": 20, "y": 42.5},
      {"No.": 3, "y": 33.5, "No": 12, "y": 29.7},
      {"No.": 21, "y": 26.3, "No": 4, "y": 70.2},
      {"No.": 13, "y": 58.7, "No": 22, "y": 48.5},
      {"No.": 5, "y": 39.8, "No": 14, "y": 35.5},
      {"No.": 23, "y": 29.5, "No": 6, "y": 49.8},
      {"No.": 15, "y": 43.5, "No": 24, "y": 38.3},
      {"No.": 7, "y": 49.5, "No": 16, "y": 42.1},
      {"No.": 25, "y": 36.7, "No": 8, "y": 57.6},
      {"No.": 17, "y": 50.1, "No": 26, "y": 43.0},
      {"No.": 9, "y": 35.0, "No": 18, "y": 29.6},
      {"No.": 27, "y": 26.9}
    ]
  },
  "figures": [
    {
      "id": "Fig. 4",
      "caption": "인코터 CAE 조건",
      "image": "Fig_4_인코터_CAE_조건.png"
    }
  ],
  "section_headers": [
    {
      "level": 1,
      "