# RAG 어플리케이션을 위한 PDF 파싱

# 데이터 전처리
- pdfminer로 텍스트 추출
- PDF를 이미지로 변환한 뒤 GPT-4V(비전 모델)로 페이지 내용 추출

# 1. Setup


In [None]:
%pip install pdf2image
%pip install pdfminer
%pip install openai
%pip install scikit-learn
%pip install rich
%pip install tqdm
%pip install concurrent

In [None]:
# Upgrade pdfminer.six to the latest available release.
!pip install --upgrade pdfminer.six

In [None]:
# Imports
from pdf2image import convert_from_path
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
from pdfminer.high_level import extract_text
import base64
from io import BytesIO
import os
import concurrent
from tqdm import tqdm
from openai import OpenAI
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
from rich import print
from ast import literal_eval

In [None]:
# Colab-only: mount Google Drive at /content/drive. The PDF folder and the
# output files used below all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# 파일 프로세싱


In [None]:
def convert_doc_to_images(path):
    """Render the PDF at ``path`` with pdf2image.

    Returns the value of ``convert_from_path(path)`` unchanged.
    """
    return convert_from_path(path)

def extract_text_from_doc(path):
    """Extract the text of the PDF at ``path`` with pdfminer.

    Returns the value of ``extract_text(path)`` unchanged. A later cell
    splits this string on form feeds (``'\\f'``) to get per-page text.
    """
    return extract_text(path)

# example 테스트

In [None]:
# System package used to convert PDFs to images
!apt-get install poppler-utils

In [None]:
# Sample PDF used to try the helpers on a single document.
# file_path = "/content/drive/MyDrive/gdrive/data/example_pdfs/fine-tuning-deck.pdf"
file_path = "/content/drive/MyDrive/example_pdfs/fine-tuning-deck.pdf"

# Page images of the sample PDF (indexed and displayed by the cells below).
images = convert_doc_to_images(file_path)

In [None]:
# Raw text of the sample PDF, extracted with pdfminer.
text = extract_text_from_doc(file_path)

In [None]:
# Show every rendered page inline in the notebook.
for img in images:
  display(img)

# GPT-4V를 이용한 이미지 분석

In [None]:
# Read the API key from the environment instead of hard-coding it in the
# notebook, so the secret never ends up in a saved or shared copy.
# Set the OPENAI_API_KEY environment variable before running this cell.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
def get_img_uri(img):
    """Encode ``img`` as a base64 JPEG data URI.

    ``img`` only needs a ``save(file_obj, format=...)`` method. The image is
    written to an in-memory buffer as JPEG, and the bytes are returned as a
    ``data:image/jpeg;base64,...`` string.
    """
    with BytesIO() as jpeg_buffer:
        img.save(jpeg_buffer, format="jpeg")
        encoded = base64.b64encode(jpeg_buffer.getvalue())
    return "data:image/jpeg;base64," + encoded.decode("utf-8")

In [None]:
# System prompt for the page-description step: it tells the vision model how
# to narrate the content of a PDF page / slide image and how to format the
# answer (optional title line followed by the content description).
# NOTE: the name `system_prompt` is assigned again further down in the
# notebook for the RAG answering step.
system_prompt = '''
You will be provided with an image of a pdf page or a slide. Your goal is to talk about the content that you see, in technical terms, as if you were delivering a presentation.

If there are diagrams, describe the diagrams and explain their meaning.
For example: if there is a diagram describing a process flow, say something like "the process flow starts with X then we have Y and Z..."

If there are tables, describe logically the content in the tables
For example: if there is a table listing items and prices, say something like "the prices are the following: A for X, B for Y..."

DO NOT include terms referring to the content format
DO NOT mention the content type - DO focus on the content itself
For example: if there is a diagram/chart and text on the image, talk about both without mentioning that one is a chart and the other is text.
Simply describe what you see in the diagram and what you understand from the text.

You should keep it concise, but keep in mind your audience cannot see the image so be exhaustive in describing the content.

Exclude elements that are not relevant to the content:
DO NOT mention page numbers or the position of the elements on the image.

------

If there is an identifiable title, identify the title to give the output in the following format:

{TITLE}

{Content description}

If there is no clear title, simply return the content description.

'''

def analyze_image(img_url):
    """Describe the image at ``img_url`` with the vision model.

    Args:
        img_url: URL or base64 data URI of the image to describe (for
            example the value returned by ``get_img_uri``).

    Returns:
        The text content of the model's first choice.

    The request goes through the module-level ``client`` and is steered by
    the module-level ``system_prompt``.
    """
    # NOTE: `system_prompt` is looked up at call time. The RAG section later
    # rebinds that name, so re-run the prompt cell above before calling this
    # function again after the RAG cells have been executed.
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    # Send the caller's image, not a fixed sample picture.
                    {"type": "image_url", "image_url": {"url": img_url}},
                ],
            },
        ],
        max_tokens=300,
        top_p=0.1,
    )
    return response.choices[0].message.content

In [None]:
# Try the encoding on a single page: the third image of the sample document.
img = images[2]
data_uri = get_img_uri(img)
# Echo the data URI as the cell output.
data_uri


In [None]:
# Describe the sample page with the vision model and print the answer.
res = analyze_image(data_uri)
print(res)

# 폴더 내 모든 문서 프로세싱

In [None]:
# Folder holding the PDFs to process.
files_path = "/content/drive/MyDrive/example_pdfs"

all_items = os.listdir(files_path)
# Keep only regular files with a .pdf extension: any other entry (hidden
# files, notes, sub-folders) would make the PDF parsers fail further down.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
files = sorted(
    item
    for item in all_items
    if item.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(files_path, item))
)

In [None]:
def analyze_doc_image(img):
    """Describe one page image.

    Encodes ``img`` with ``get_img_uri`` and returns the result of
    ``analyze_image`` for that data URI.
    """
    return analyze_image(get_img_uri(img))

In [None]:
# `import concurrent` alone does not load the `futures` submodule, so import
# it explicitly before using the thread pool.
import concurrent.futures

# One entry per file: {"filename", "text", "pages_description"}.
docs = []

for f in files:

    path = os.path.join(files_path, f)
    text = extract_text_from_doc(path)
    doc = {
        "filename": f,
        "text": text,
    }
    imgs = convert_doc_to_images(path)

    print(f"Analyzing pages for doc {f}")

    # Concurrent execution
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:

        # Removing 1st slide as it's usually just an intro
        futures = [
            executor.submit(analyze_doc_image, img)
            for img in imgs[1:]
        ]

        # Size the bar from the submitted jobs so an empty document gives a
        # total of 0 rather than -1.
        with tqdm(total=len(futures)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)

        # Collect in submission order so descriptions stay aligned with the
        # pages. A dedicated loop name keeps the filename `f` intact.
        pages_description = [future.result() for future in futures]

    doc['pages_description'] = pages_description
    docs.append(doc)

In [None]:
# Save the results: dump the parsed documents (text + page descriptions) to a
# JSON file on Drive.
json_path = "/content/drive/MyDrive/parsed_pdf_docs.json"

with open(json_path, 'w') as f:
    json.dump(docs, f)

In [None]:
# Load the parsed documents back from the JSON file into `docs`.
with open(json_path, 'r') as f:
  docs = json.load(f)

# Embedding Content
- 콘텐츠를 페이지 단위 청크로 분할하고, 페이지 텍스트와 해당 페이지 설명을 병합

In [None]:
# Chunking content by page and merging together slides text & description if applicable
content = []
for doc in docs:
    # Removing first slide as well (pages are separated by form feeds)
    pages_text = doc['text'].split('\f')[1:]
    descriptions = doc['pages_description']
    # First line of each description, treated as its title
    description_titles = [d.split('\n')[0] for d in descriptions]
    # Indexes of the descriptions already merged into a page chunk
    used_descriptions = set()
    for page_text in pages_text:
        slide_content = page_text + '\n'
        # Trying to find matching slide description (case-insensitive title match)
        slide_title = page_text.split('\n')[0].lower()
        for j, description_title in enumerate(description_titles):
            if slide_title == description_title.lower():
                # Drop only the leading title line. str.replace() would also
                # delete every later occurrence of the title inside the
                # description body.
                slide_content += descriptions[j][len(description_title):]
                used_descriptions.add(j)
        # Adding the slide content + matching slide description to the content pieces
        content.append(slide_content)
    # Adding the slides descriptions that weren't used
    content.extend(
        d for j, d in enumerate(descriptions) if j not in used_descriptions
    )

In [None]:
# Print every merged chunk, followed by a separator line, for inspection.
for c in content:
    print(c)
    print("\n\n-------------------------------\n\n")

In [None]:
# The extracted content is sometimes noisy, so run this cell as a
# complementary clean-up step.

# Cleaning up content
# Removing trailing spaces, additional line breaks, page numbers and references to the content being a slide
clean_content = []
for c in content:
    # Strip trailing spaces but keep the line break, so that consecutive
    # lines are not glued together.
    text = re.sub(r" +\n", "\n", c)
    # Page numbers: lines made only of one or two digits. Anchoring on the
    # whole line keeps content such as "10 examples" or "2023" intact.
    text = re.sub(r"^\d{1,2}[ \t]*$", "", text, flags=re.MULTILINE)
    # Collapse any run of blank lines into a single line break.
    text = re.sub(r"\n{2,}", "\n", text).strip()
    text = re.sub(r"\b(?:the|this)\s*slide\s*\w+\b", "", text, flags=re.IGNORECASE)
    # Skip chunks that end up empty (e.g. the empty piece after the last form
    # feed): they carry no information and cannot be embedded.
    if not text.strip():
        continue
    clean_content.append(text)

In [None]:
# Print every cleaned chunk, followed by a separator line, for inspection.
for c in clean_content:
    print(c)
    print("\n\n-------------------------------\n\n")

In [None]:
# Convert the cleaned chunks to a DataFrame (one row per chunk)
# Creating the embeddings
# We'll save to a csv file here for testing purposes but this is where you should load content in your vectorDB.
df = pd.DataFrame(clean_content, columns=['content'])
print(df.shape)
df.head()

In [None]:
# Embedding model used for both the stored chunks and the search queries.
# The value matches the model the API call has actually been using, so
# embeddings saved earlier stay comparable with new query embeddings.
embeddings_model = "text-embedding-3-small"

def get_embeddings(text):
    """Return the embedding vector of ``text``.

    Sends one request to the embeddings endpoint through the module-level
    ``client`` using ``embeddings_model`` and returns the ``embedding`` of
    the first item in the response.
    """
    response = client.embeddings.create(
        model=embeddings_model,
        input=text,
        encoding_format="float",
    )
    return response.data[0].embedding

In [None]:
# Embed every chunk (get_embeddings is called once per row) and preview the
# resulting frame.
df['embeddings'] = df['content'].apply(get_embeddings)
df.head()

In [None]:
# Saving locally for later: write the chunks and their embeddings to a CSV
# file on Drive (without the index column).
data_path = "/content/drive/MyDrive/parsed_pdf_docs_with_embeddings.csv"
df.to_csv(data_path, index=False)

In [None]:
# Optional: load data from saved file
df = pd.read_csv(data_path)
# The CSV holds each embedding as text; parse it back with literal_eval and
# convert it to a numpy array.
df["embeddings"] = df.embeddings.apply(literal_eval).apply(np.array)

# Retrieval-augmented generation


In [None]:
# System prompt for the answering step: the model receives the user's input
# prompt plus retrieved content and is told how to use (or ignore) it.
# NOTE: this rebinds the `system_prompt` name that was first assigned for the
# page-description step earlier in the notebook.
system_prompt = '''
    You will be provided with an input prompt and content as context that can be used to reply to the prompt.

    You will do 2 things:

    1. First, you will internally assess whether the content provided is relevant to reply to the input prompt.

    2a. If that is the case, answer directly using this content. If the content is relevant, use elements found in the content to craft a reply to the input prompt.

    2b. If the content is not relevant, use your own knowledge to reply or say that you don't know how to respond if your knowledge is not sufficient to answer.

    Stay concise with your answer, replying specifically to the input prompt without mentioning additional information provided in the context content.
'''

# Chat model used by generate_output (alternative kept below for reference).
#model="gpt-4-turbo-preview"
model="gpt-4o"


# Search the embedded chunks for the ones closest to the query
def search_content(df, input_text, top_k):
    """Return the ``top_k`` rows of ``df`` most similar to ``input_text``.

    Args:
        df: DataFrame with an ``embeddings`` column (one vector per row).
        input_text: Query text; it is embedded with ``get_embeddings``.
        top_k: Number of rows to return.

    Returns:
        The ``top_k`` rows sorted by descending ``similarity``.

    Side effect: a ``similarity`` column is added to (or overwritten on) the
    ``df`` passed in. Each value is the 1x1 array returned by
    ``cosine_similarity`` for two single-row inputs; use ``get_similarity``
    to read it as a scalar.
    """
    # Embed and reshape the query once instead of once per row.
    query_vector = np.array(get_embeddings(input_text)).reshape(1, -1)
    df["similarity"] = df.embeddings.apply(
        lambda x: cosine_similarity(np.array(x).reshape(1, -1), query_vector)
    )
    return df.sort_values('similarity', ascending=False).head(top_k)

# Read the similarity score of a row
def get_similarity(row):
    """Return ``row['similarity']`` as a scalar.

    When the stored value is a numpy array, its ``[0][0]`` element is
    returned; any other value is returned unchanged.
    """
    score = row['similarity']
    return score[0][0] if isinstance(score, np.ndarray) else score

# Generate the answer
def generate_output(input_prompt, similar_content, threshold = 0.5):
    """Answer ``input_prompt`` using the retrieved ``similar_content``.

    Args:
        input_prompt: The user's question.
        similar_content: DataFrame of matches, best match first, with
            ``content`` and ``similarity`` columns (as returned by
            ``search_content``).
        threshold: Minimum similarity for matches after the first one to be
            added to the context.

    Returns:
        The text content of the model's first choice.

    The request uses the module-level ``client``, ``model`` and
    ``system_prompt``.
    """
    if len(similar_content) == 0:
        # Nothing retrieved: send an empty context instead of failing on
        # iloc[0]; the system prompt covers the "content not relevant" case.
        content = ""
    else:
        # The best match is always part of the context.
        content = similar_content.iloc[0]['content']

        # Adding more matching content if the similarity is above threshold.
        # Start from the second row so the best match is not appended twice.
        for _, row in similar_content.iloc[1:].iterrows():
            if get_similarity(row) > threshold:
                content += f"\n\n{row['content']}"

    prompt = f"INPUT PROMPT:\n{input_prompt}\n-------\nCONTENT:\n{content}"

    completion = client.chat.completions.create(
        model=model,
        temperature=0.5,
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    return completion.choices[0].message.content

In [None]:
# Example user queries related to the content.
# Each one is run through the retrieval + answer pipeline in the next cell.
example_inputs = [
    'What are the main models you offer?',
    'Do you have a speech recognition model?',
    'Which embedding model should I use for non-English use cases?',
    'Can I introduce new knowledge in my LLM app using RAG?',
    'How many examples do I need to fine-tune a model?',
    'Which metric can I use to evaluate a summarization task?',
    'Give me a detailed example for an evaluation process where we are looking for a clear answer to compare to a ground truth.',
]

In [None]:

# Dynamic text (queries, chunk previews, model replies) may contain square
# brackets that rich would read as markup, so escape it before interpolation.
from rich.markup import escape

# Running the RAG pipeline on each example
for ex in example_inputs:
    # Styled output (`print` is rich's print, so [tags] are markup)
    print(f"[deep_pink4][bold]QUERY:[/bold] {escape(ex)}[/deep_pink4]\n\n")
    matching_content = search_content(df, ex, 3)  # fetch the 3 most similar chunks
    print(f"[grey37][b]Matching content:[/b][/grey37]\n")
    for i, match in matching_content.iterrows():
        print(f"[grey37][i]Similarity: {get_similarity(match):.2f}[/i][/grey37]")
        # Show at most the first 100 characters of each matching chunk.
        preview = escape(match['content'][:100])
        ellipsis = '...' if len(match['content']) > 100 else ''
        # The closing tag must be [/grey37]; the previous "[/[grey37]]" was
        # not valid markup and leaked stray brackets into the output.
        print(f"[grey37]{preview}{ellipsis}[/grey37]\n\n")
    reply = generate_output(ex, matching_content)
    print(f"[turquoise4][b]REPLY:[/b][/turquoise4]\n\n[spring_green4]{escape(reply)}[/spring_green4]\n\n--------------\n\n")