# Import

In [None]:
#Langchain
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
from langchain.prompts import PromptTemplate

#Lib Extract PDF
from PyPDF2 import PdfReader

from tqdm import tqdm
import pandas as pd
import time

# Model

In [None]:
# Placeholders passed to the Vertex AI chat model constructor as `project`
# and `location` (presumably the Google Cloud project ID and region —
# replace with real values before running).
project = "<project>"
location = "<location>"

In [None]:
# Chat model shared by the text-cleaning and keyword chains: an Anthropic
# Claude model accessed through the Vertex AI Model Garden integration.
llm = ChatAnthropicVertex(
    model_name="claude-3-5-sonnet@20240620",
    project=project,
    location=location,
    temperature=0.1,  # low sampling temperature
    max_tokens=6046,  # cap on tokens generated per response
    timeout=None,  # NOTE(review): presumably means "no request timeout" — confirm
)

# Create Vector Store

In [None]:
# Prompt asking the model to repair text extracted from a PDF. Its only
# input variable is `raw_text`.
# NOTE: this is a normal (non-raw) string, so every "\n" written below is a
# real newline in addition to the physical line break that follows it.
template_clean_text = """
    
    Your task is to clean and restore the text extracted from a PDF document. \n
    The text may be poorly formatted, contain errors, or have inconsistent spacing and characters due to the conversion process. \n
    Your goal is to correct these issues and reconstruct the text so that it matches its original, \n
    intended form as closely as possible.
  
    DO NOT show "Here's the cleaned and restored version of the document:"
    
    this is my document: {raw_text}

    """
    
# Wrap the template string in a LangChain PromptTemplate.
prompt_clean_text = PromptTemplate(template=template_clean_text)

In [None]:
# Prompt asking the model (answering in Thai) for keywords, example errors
# and likely customer questions found in a document. Its only input variable
# is `clean_texts`. The requested answer shape is a bracketed,
# comma-separated list of three quoted blocks; `invoke_keyword` passes the
# answer to `text_to_list` to split it.
template_keyword = """
    
    Your task is to extract potential keywords from a given document. keyword always in "Keyword ที่ user อาจใช้" , "ตัวอย่าง Error"
    
    You should answer in Thai Languages
    
    You Should generate potential questions that a customer might ask based on the content of a given document. Think about the different ways a customer might phrase their inquiries to find relevant information. If the document contains examples, use them to inspire additional questions. The goal is to anticipate all possible questions a customer might ask about the document.
    
    You should answer in format list:
    [
    "
    Keyword:
        - 
        - 
        - 
    "
    ,
    "
    Error:
        - 
        - 
        - 
    "
    ,
    "
    Question:
        -
        -
        -
    "
    ]
        
    DO NOT show Other keyword

    
    this is my document: {clean_texts}

    """
    
# Wrap the template string in a LangChain PromptTemplate.
prompt_keyword = PromptTemplate(template=template_keyword)

## Chain

In [None]:
# Compose each prompt with the chat model: the formatted prompt is piped
# into `llm`. These chains are the defaults used by `clean_text` and
# `invoke_keyword`.
chain_clean_text = prompt_clean_text | llm
chain_keyword = prompt_keyword | llm

# Function in Pipeline

### Embedding model

In [None]:
import key_param as key_param
import os
from openai import OpenAI

# Export the API key stored in the local `key_param` module as an
# environment variable.
os.environ["OPENAI_API_KEY"] = key_param.OPENAI_API_KEY
# No api_key is passed explicitly; presumably the client reads
# OPENAI_API_KEY from the environment — confirm against the OpenAI SDK docs.
client = OpenAI()

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the module-level OpenAI client.

    Args:
        text: Value to embed. It is converted with ``str`` and every newline
            is replaced by a space before the request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = str(text).replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

### Function

In [None]:
def text_to_list(text):
    """Split a bracketed, comma-separated text answer into a list of strings.

    Surrounding whitespace is removed first, then the enclosing ``[``/``]``
    characters, and the remainder is split on every comma. Each item has its
    outer whitespace and then its outer double quotes removed; whitespace
    inside the quotes is kept.

    Note:
        The split is on *every* comma, so an item that itself contains a
        comma is returned as several items.

    Args:
        text: The text to split, e.g. ``'["a", "b"]'``.

    Returns:
        A list of strings; an empty input yields ``[""]``.
    """
    # Strip whitespace before the brackets: model answers often start or end
    # with a newline, which would otherwise shield the brackets from
    # ``strip("[]")`` and leave them attached to the first/last item.
    unbracketed = text.strip().strip("[]")
    return [item.strip().strip('"') for item in unbracketed.split(",")]

In [None]:
def read_PDF(folder_path, file_name):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        folder_path: Directory that contains the PDF, with or without a
            trailing path separator.
        file_name: Name of the PDF file inside ``folder_path``.

    Returns:
        The text of all pages concatenated in page order. Pages for which
        ``extract_text()`` returns an empty value are skipped.
    """
    # os.path.join adds the separator only when it is missing, so the result
    # no longer depends on ``folder_path`` ending with a slash.
    path = os.path.join(folder_path, file_name)
    reader = PdfReader(str(path))

    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(text for text in page_texts if text)

In [None]:
def clean_text(raw_text, chain_clean_text=chain_clean_text):
    """Run the cleaning chain on raw PDF text and return the model's reply.

    Args:
        raw_text: Text to be cleaned; substituted into the prompt's
            ``raw_text`` variable.
        chain_clean_text: Runnable invoked with ``{"raw_text": ...}``.
            Defaults to the module-level cleaning chain.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    # Pass the string itself. Writing ``{raw_text}`` here would build a
    # one-element set, and the prompt would then contain the set's repr
    # (extra braces and quotes, newlines shown as escape sequences) instead
    # of the document text.
    final_answer = chain_clean_text.invoke({"raw_text": raw_text})
    return final_answer.content

In [None]:
def invoke_keyword(clean_texts, chain_keyword=chain_keyword):
    """Run the keyword chain on cleaned text and return its answer as a list.

    Args:
        clean_texts: Cleaned document text; substituted into the prompt's
            ``clean_texts`` variable.
        chain_keyword: Runnable invoked with ``{"clean_texts": ...}``.
            Defaults to the module-level keyword chain.

    Returns:
        The result of ``text_to_list`` applied to the response's ``content``.
    """
    # Pass the string itself. Writing ``{clean_texts}`` here would build a
    # one-element set, and the prompt would then contain the set's repr
    # (extra braces and quotes, newlines shown as escape sequences) instead
    # of the document text.
    final_answer = chain_keyword.invoke({"clean_texts": clean_texts})
    return text_to_list(final_answer.content)

# Run Create Vector Store

In [None]:
def list_files_in_folder(folder_path):
    """Return the names of the regular files directly inside ``folder_path``.

    Sub-directories are left out and no filtering by extension is done, so
    non-PDF files are returned as well.

    Args:
        folder_path: Directory to list.

    Returns:
        A list of file names (not full paths). If the folder does not exist
        or cannot be accessed, a message is printed and ``[]`` is returned.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder '{folder_path}' not found.")
        return []
    except PermissionError:
        print(f"Permission denied to access '{folder_path}'.")
        return []

    names = []
    for entry in entries:
        if os.path.isfile(os.path.join(folder_path, entry)):
            names.append(entry)
    return names

# Input folder and the names of the files found in it. The listing contains
# every regular file in the folder, not only PDFs.
folder_path = './PDF/'
files = list_files_in_folder(folder_path)

In [None]:
# Start from an empty table with the four output columns; rows are appended
# to it later and the result is written to CSV.
data = {
    column_name: []
    for column_name in ("FileName", "Content", "Keyword", "ada_embedding")
}

df = pd.DataFrame(data)

In [None]:
columns = ["FileName", "Content", "Keyword", "ada_embedding"]

# For every input file: extract the PDF text, clean it with the LLM, derive
# the keyword entries, embed each entry, and append one row per entry to `df`.
for file_name in tqdm(files):
    raw_text = read_PDF(folder_path, file_name)
    clean_texts = clean_text(raw_text)
    keyword = invoke_keyword(clean_texts)

    # Build all rows of this file first and append them with one concat.
    # Appending row by row copies the whole frame on every append.
    rows = [
        [file_name, clean_texts, entry, get_embedding(entry)]
        for entry in keyword
    ]
    if rows:
        new_df = pd.DataFrame(rows, columns=columns)
        df = pd.concat([df, new_df], ignore_index=True)

    # Pause between files — presumably to stay clear of API timeouts or
    # rate limits (the original note reads "care timeout").
    time.sleep(10)

In [None]:
# Write the table to a CSV file in the working directory, without the index.
# NOTE(review): CSV stores every cell as text, so the `ada_embedding` values
# will presumably need to be parsed back when the file is reloaded — confirm.
df.to_csv("PDF_VectorStore.csv", index=False)